From 6a99fcce94ac0ff76bcdc502197fb8e0cd1c97c6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 10 Mar 2018 00:03:49 +0100
Subject: [PATCH 001/935] Use _Atomic instead of volatile for thread safety
 where C11 is supported

Suggested by dodomorandi in #660
---
 driver/level3/level3_gemm3m_thread.c | 7 ++++++-
 driver/level3/level3_syrk_threaded.c | 7 ++++++-
 driver/level3/level3_thread.c        | 7 ++++++-
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c
index bfd991ffb..f5e5bca1e 100644
--- a/driver/level3/level3_gemm3m_thread.c
+++ b/driver/level3/level3_gemm3m_thread.c
@@ -91,7 +91,12 @@
 #endif

 typedef struct {
-  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
+#if __STDC_VERSION__ >= 201112L
+  _Atomic
+#else
+  volatile
+#endif
+  BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;

diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c
index 65002ae46..d1c476f00 100644
--- a/driver/level3/level3_syrk_threaded.c
+++ b/driver/level3/level3_syrk_threaded.c
@@ -67,7 +67,12 @@
 #endif

 typedef struct {
-  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
+#if __STDC_VERSION__ >= 201112L
+_Atomic
+#else
+  volatile
+#endif
+  BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;

diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index a1ed8bbb1..47b20f7fa 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -91,7 +91,12 @@
 #endif

 typedef struct {
-  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
+#if __STDC_VERSION__ >= 201112L
+_Atomic
+#else
+  volatile
+#endif
+  BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;
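For readers following the series: __STDC_VERSION__ expands to 201112L or greater
on C11 compilers, so the guard above selects _Atomic storage when real atomic
load/store semantics are available and falls back to volatile (which only
prevents caching in registers and gives no ordering guarantees) otherwise. Below
is a minimal standalone sketch of the same selection; the SHARED_QUAL macro and
the array dimensions are illustrative, the patches inline the #if directly:

    #include <stdio.h>

    #if __STDC_VERSION__ >= 201112L
    #define SHARED_QUAL _Atomic   /* C11: loads/stores become atomic operations */
    #else
    #define SHARED_QUAL volatile  /* pre-C11: only inhibits register caching */
    #endif

    typedef struct {
      SHARED_QUAL long working[4][16];  /* illustrative dimensions */
    } job_t;

    int main(void) {
      job_t job = {{{0}}};
      job.working[0][0] = 1;            /* an atomic store on the C11 path */
      printf("flag = %ld\n", (long)job.working[0][0]);
      return 0;
    }

Note that the macro name must be spelled exactly __STDC_VERSION__ (two leading
underscores); a misspelling is simply undefined, evaluates as 0 inside #if, and
silently keeps the volatile fallback even on C11 compilers.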
From 40160ff3c1d2427b312f9894795e89f21a48eca0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 10 Mar 2018 00:15:44 +0100
Subject: [PATCH 002/935] Use _Atomic instead of volatile for thread safety
 where C11 is supported

---
 lapack/getrf/getrf_parallel.c |  32 +-
 lapack/getrf/potrf_parallel.c | 664 ++++++++++++++++++++++++++++++++++
 2 files changed, 690 insertions(+), 6 deletions(-)
 create mode 100644 lapack/getrf/potrf_parallel.c

diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c
index db8c836e0..27faea0cd 100644
--- a/lapack/getrf/getrf_parallel.c
+++ b/lapack/getrf/getrf_parallel.c
@@ -99,7 +99,11 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
   FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
   FLOAT *sbb = sb;

+#if __STDC_VERSION__ >= 201112L
+  _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
+#else
   volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
+#endif

   blasint *ipiv = (blasint *)args -> c;

@@ -177,7 +181,12 @@
 /* Non blocking implementation */

 typedef struct {
-  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
+#if __STDC_VERSION__ >= 201112L
+  _Atomic
+#else
+  volatile
+#endif
+  BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
 } job_t;

 #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);

@@ -216,9 +225,11 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
   FLOAT *sbb= sb;

   blasint *ipiv = (blasint *)args -> c;
-
+#if __STDC_VERSION__ >= 201112L
+  _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
+#else
   volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
-
+#endif
   if (args -> a == NULL) {
     TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
     sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
@@ -378,7 +389,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 #ifdef _MSC_VER
   BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
 #else
-  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
+#if __STDC_VERSION__ >= 201112L
+  _Atomic
+#else
+  volatile
+#endif
+  BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
 #endif

 #ifndef COMPLEX
@@ -634,8 +650,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,

   BLASLONG range[MAX_CPU_NUMBER + 1];

   BLASLONG width, nn, num_cpu;
-
-  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
+#if __STDC_VERSION__ >= 201112L
+  _Atomic
+#else
+  volatile
+#endif
+  BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

 #ifndef COMPLEX
 #ifdef XDOUBLE

diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c
new file mode 100644
index 000000000..104022dd9
--- /dev/null
+++ b/lapack/getrf/potrf_parallel.c
@@ -0,0 +1,664 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifndef USE_SIMPLE_THREADED_LEVEL3
+
+//The array of job_t may overflow the stack.
+//Instead, use malloc to alloc job_t.
+#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
+#define USE_ALLOC_HEAP
+#endif
+
+
+static FLOAT dm1 = -1.;
+
+#ifndef KERNEL_FUNC
+#ifndef LOWER
+#define KERNEL_FUNC SYRK_KERNEL_U
+#else
+#define KERNEL_FUNC SYRK_KERNEL_L
+#endif
+#endif
+
+#ifndef LOWER
+#ifndef COMPLEX
+#define TRSM_KERNEL TRSM_KERNEL_LT
+#else
+#define TRSM_KERNEL TRSM_KERNEL_LC
+#endif
+#else
+#ifndef COMPLEX
+#define TRSM_KERNEL TRSM_KERNEL_RN
+#else
+#define TRSM_KERNEL TRSM_KERNEL_RR
+#endif
+#endif
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 8
+#endif
+
+#ifndef DIVIDE_RATE
+#define DIVIDE_RATE 2
+#endif
+
+#ifndef SWITCH_RATIO
+#define SWITCH_RATIO 2
+#endif
+
+#ifndef LOWER
+#define TRANS
+#endif
+
+#ifndef SYRK_LOCAL
+#if !defined(LOWER) && !defined(TRANS)
+#define SYRK_LOCAL SYRK_UN
+#elif !defined(LOWER) && defined(TRANS)
+#define SYRK_LOCAL SYRK_UT
+#elif defined(LOWER) && !defined(TRANS)
+#define SYRK_LOCAL SYRK_LN
+#else
+#define SYRK_LOCAL SYRK_LT
+#endif
+#endif
+
+typedef struct {
+#if __STDC_VERSION__ >= 201112L
+  _Atomic
+#else
+  volatile
+#endif
+  BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
+} job_t;
+
+
+#ifndef KERNEL_OPERATION
+#ifndef COMPLEX
+#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
+	KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
+#else
+#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
+	KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
+#endif
+#endif
+
+#ifndef ICOPY_OPERATION
+#ifndef TRANS
+#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
+#else
+#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
+#endif
+#endif
+
+#ifndef OCOPY_OPERATION
+#ifdef TRANS
+#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
+#else
+#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
+#endif
+#endif
+
+#ifndef S
+#define S args -> a
+#endif
+#ifndef A
+#define A args -> b
+#endif
+#ifndef C
+#define C args -> c
+#endif
+#ifndef LDA
+#define LDA args -> lda
+#endif
+#ifndef N
+#define N args -> m
+#endif
+#ifndef K
+#define K args -> k
+#endif
+
+static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
+
+  FLOAT *buffer[DIVIDE_RATE];
+
+  BLASLONG k, lda;
+  BLASLONG m_from, m_to;
+
+  FLOAT *alpha;
+  FLOAT *a, *c;
+  job_t *job = (job_t *)args -> common;
+  BLASLONG xxx, bufferside;
+
+  BLASLONG jjs, min_jj;
+  BLASLONG is, min_i, div_n;
+
+  BLASLONG i, current;
+
+  k = K;
+
+  a = (FLOAT *)A;
+  c = (FLOAT *)C;
+
+  lda = LDA;
+
+  alpha = (FLOAT *)args -> alpha;
+
+  m_from = range_n[mypos + 0];
+  m_to   = range_n[mypos + 1];
+
+#if 0
+  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
+#endif
+
+  div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
+
+  buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
+  for (i = 1; i < DIVIDE_RATE; i++) {
+    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
+  }
+
+#ifndef LOWER
+  TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
+#else
+  TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
+#endif
+
+ 
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + +#ifndef LOWER + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; +#else + if (min_jj > GEMM_P) min_jj = GEMM_P; +#endif + +#ifndef LOWER + OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (k, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb, + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + a + jjs * lda * COMPSIZE, lda, 0); +#else + ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (min_jj, k, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + sb, + a + jjs * COMPSIZE, lda, 0); +#endif + } + +#ifndef LOWER + for (i = 0; i <= mypos; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#else + for (i = mypos; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#endif + + WMB; + } + + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + /* thread has to wait */ + if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, m_from, xxx); + + if (m_from + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + +#ifndef LOWER + current ++; +#else + current --; +#endif + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, is, xxx); + + if (is + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } +#ifndef LOWER + current ++; +#else + current --; +#endif + } + } + + for (i = 0; i < args -> nthreads; i++) { + if (i != mypos) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while 
(job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + } + + return 0; + } + +static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ + + blas_arg_t newarg; + +#ifndef USE_ALLOC_HEAP + job_t job[MAX_CPU_NUMBER]; +#else + job_t * job = NULL; +#endif + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range[MAX_CPU_NUMBER + 100]; + + BLASLONG num_cpu; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k; + BLASLONG n, n_from, n_to; + int mode, mask; + double dnum; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_REAL; + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; +#endif +#endif + + newarg.m = args -> m; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.alpha = args -> alpha; + +#ifdef USE_ALLOC_HEAP + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + + newarg.common = (void *)job; + + n_from = 0; + n_to = args -> m; + +#ifndef LOWER + + range[MAX_CPU_NUMBER] = n_to - n_from; + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + + if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; + +#else + + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = range; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + newarg.nthreads = num_cpu; + + if (num_cpu) { + + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[0].sa = sa; + queue[0].sb = 
sb;
+  queue[num_cpu - 1].next = NULL;
+
+  exec_blas(num_cpu, queue);
+  }
+
+#ifdef USE_ALLOC_HEAP
+  free(job);
+#endif
+
+  return 0;
+}
+
+#endif
+
+blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
+
+  BLASLONG n, bk, i, blocking, lda;
+  BLASLONG info;
+  int mode;
+  blas_arg_t newarg;
+  FLOAT *a;
+  FLOAT alpha[2] = { -ONE, ZERO};
+
+#ifndef COMPLEX
+#ifdef XDOUBLE
+  mode  =  BLAS_XDOUBLE | BLAS_REAL;
+#elif defined(DOUBLE)
+  mode  =  BLAS_DOUBLE  | BLAS_REAL;
+#else
+  mode  =  BLAS_SINGLE  | BLAS_REAL;
+#endif
+#else
+#ifdef XDOUBLE
+  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
+#elif defined(DOUBLE)
+  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
+#else
+  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
+#endif
+#endif
+
+  if (args -> nthreads  == 1) {
+#ifndef LOWER
+    info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
+#else
+    info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
+#endif
+    return info;
+  }
+
+  n  = args -> n;
+  a  = (FLOAT *)args -> a;
+  lda = args -> lda;
+
+  if (range_n) n  = range_n[1] - range_n[0];
+
+  if (n <= GEMM_UNROLL_N * 2) {
+#ifndef LOWER
+    info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0);
+#else
+    info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
+#endif
+    return info;
+  }
+
+  newarg.lda = lda;
+  newarg.ldb = lda;
+  newarg.ldc = lda;
+  newarg.alpha = alpha;
+  newarg.beta = NULL;
+  newarg.nthreads = args -> nthreads;
+
+  blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
+  if (blocking > GEMM_Q) blocking = GEMM_Q;
+
+  for (i = 0; i < n; i += blocking) {
+    bk = n - i;
+    if (bk > blocking) bk = blocking;
+
+    newarg.m = bk;
+    newarg.n = bk;
+    newarg.a = a + (i + i * lda) * COMPSIZE;
+
+    info = CNAME(&newarg, NULL, NULL, sa, sb, 0);
+    if (info) return info + i;
+
+    if (n - i - bk > 0) {
+#ifndef USE_SIMPLE_THREADED_LEVEL3
+      newarg.m = n - i - bk;
+      newarg.k = bk;
+#ifndef LOWER
+      newarg.b = a + ( i      + (i + bk) * lda) * COMPSIZE;
+#else
+      newarg.b = a + ((i + bk) +  i      * lda) * COMPSIZE;
+#endif
+      newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
+
+      thread_driver(&newarg, sa, sb);
+#else
+
+#ifndef LOWER
+      newarg.m = bk;
+      newarg.n = n - i - bk;
+      newarg.a = a + (i +  i       * lda) * COMPSIZE;
+      newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
+
+      gemm_thread_n(mode | BLAS_TRANSA_T,
+		    &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);
+
+      newarg.n = n - i - bk;
+      newarg.k = bk;
+      newarg.a = a + ( i       + (i + bk) * lda) * COMPSIZE;
+      newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
+
+#if 0
+      HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
+#else
+      syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
+		  &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads);
+#endif
+#else
+      newarg.m = n - i - bk;
+      newarg.n = bk;
+      newarg.a = a + (i      + i * lda) * COMPSIZE;
+      newarg.b = a + (i + bk + i * lda) * COMPSIZE;
+
+      gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
+		    &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);
+
+      newarg.n = n - i - bk;
+      newarg.k = bk;
+      newarg.a = a + (i + bk +  i       * lda) * COMPSIZE;
+      newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;
+
+#if 0
+      HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
+#else
+      syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
+		  &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
+#endif
+#endif
+
+#endif
+    }
+  }
+  return 0;
+}
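For orientation before the next patch: the working[][] slots in job_t act as one
mailbox per (owner, consumer) pair. An owner publishes the address of a finished
buffer by storing it (cast to BLASLONG) into its slot, consumers spin with
YIELDING until the slot becomes non-zero, and clear it when done, which the owner
in turn waits for before reusing the buffer. A reduced two-thread sketch of that
handshake follows, using plain pthreads in place of the BLAS server and
sched_yield() for YIELDING. This shows the pre-C11 volatile flavor and carries
the same formal data-race caveat that motivated patch 001; all names other than
job/working are illustrative, and pointers are assumed to fit in a long (as with
BLASLONG on the platforms this code targets):

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    #define NTHREADS 2

    typedef struct {
      /* one slot per consumer; OpenBLAS uses CACHE_LINE_SIZE * DIVIDE_RATE slots */
      volatile long working[NTHREADS][1];
    } job_t;

    static job_t  job[NTHREADS];
    static double buffer[NTHREADS][4];

    static void *worker(void *arg) {
      long me = (long)arg, other = 1 - me;
      /* produce: fill my buffer, then publish its address for the other thread */
      for (int i = 0; i < 4; i++) buffer[me][i] = (double)(me + 1);
      job[me].working[other][0] = (long)&buffer[me][0];
      /* consume: spin until the other thread has published, then use and clear */
      while (job[other].working[me][0] == 0) sched_yield();  /* YIELDING */
      double *b = (double *)job[other].working[me][0];
      printf("thread %ld sees %g\n", me, b[0]);
      job[other].working[me][0] = 0;  /* tell the producer we are done */
      /* producer side: wait until every consumer has cleared our slot */
      while (job[me].working[other][0] != 0) sched_yield();
      return NULL;
    }

    int main(void) {
      pthread_t t[NTHREADS];
      for (long i = 0; i < NTHREADS; i++) pthread_create(&t[i], NULL, worker, (void *)i);
      for (long i = 0; i < NTHREADS; i++) pthread_join(t[i], NULL);
      return 0;
    }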
From 68a3c4fca60461f69fbec2da80454fde022b1adc Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Thu, 19 Apr 2018 09:05:25 +0000
Subject: [PATCH 003/935] ARM64: Enable Auto Detection of ThunderX2T99

---
 cpuid_arm64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index bd7fb7f2d..a42346c88 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -121,7 +121,7 @@ int detect(void)
 			return CPU_VULCAN;
 		else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
 			return CPU_THUNDERX;
-		else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
+		else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
 			return CPU_THUNDERX2T99;
 	}

From 5fcaca6438855fa295e7fa012ffa38f12599ede7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 20 Apr 2018 15:42:13 +0200
Subject: [PATCH 004/935] fork utest depends on CBLAS

---
 utest/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utest/Makefile b/utest/Makefile
index e40b3c6db..e071540dc 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -17,11 +17,13 @@ endif

 #this does not work with OpenMP nor with native Windows or Android threads
 # FIXME TBD if this works on OSX, SunOS, POWER and zarch
+ifneq ($(NO_CBLAS), 1)
 ifndef USE_OPENMP
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
 OBJS += test_fork.o
 endif
 endif
+endif

 all : run_test

From 625c74a38f481e8ed818334abffd493448f77ebd Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 20 Apr 2018 15:43:59 +0200
Subject: [PATCH 005/935] fork utest depends on CBLAS

---
 utest/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 1b426afe7..77a42d84f 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -25,6 +25,7 @@ endif ()

 # known to hang with the native Windows and Android threads
 # FIXME needs checking if this works on any of the other platforms
+if (NOT NO_CBLAS)
 if (NOT USE_OPENMP)
 if (OS_CYGWIN_NT OR OS_LINUX)
 set(OpenBLAS_utest_src
@@ -33,6 +34,7 @@ set(OpenBLAS_utest_src
 )
 endif()
 endif()
+endif()

 if (NOT NO_LAPACK)
 set(OpenBLAS_utest_src

From 9c5518319a1370984abe9a2a55a5ebeb1deeccf5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 22 Apr 2018 20:20:04 +0200
Subject: [PATCH 006/935] Revert "Fix 32bit HASWELL builds"

---
 kernel/Makefile.L3 | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 4284fbfa0..066426396 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -29,10 +29,8 @@ USE_TRMM = 1
 endif

 ifeq ($(CORE), HASWELL)
-ifeq ($(ARCH), x86_64)
 USE_TRMM = 1
 endif
-endif

 ifeq ($(CORE), ZEN)
 USE_TRMM = 1

From 8a3b6fa108b15331c2af8777d1ea0206f85673b8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 23 Apr 2018 19:05:49 +0200
Subject: [PATCH 007/935] Use generic zrot.c on ppc64/POWER6 to work around
 utest failure from … (#1535)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Use generic C implementation of zrot on ppc64/POWER6 to work around utest
failure from #1469
---
 kernel/power/KERNEL.POWER6 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/power/KERNEL.POWER6 b/kernel/power/KERNEL.POWER6
index 344b205fe..e6d2c9a51 100644
--- a/kernel/power/KERNEL.POWER6
+++ b/kernel/power/KERNEL.POWER6
@@ -54,3 +54,6 @@ ZTRSMKERNEL_LN	= ztrsm_kernel_power6_LN.S
 ZTRSMKERNEL_LT	= ztrsm_kernel_power6_LT.S
 ZTRSMKERNEL_RN	= ztrsm_kernel_power6_LT.S
 ZTRSMKERNEL_RT	= ztrsm_kernel_power6_RT.S
+
+CROTKERNEL	= ../arm/zrot.c
+ZROTKERNEL	= ../arm/zrot.c

From 125343cc886accee268b03f020c09658cff37509 Mon Sep 17 00:00:00 2001
From: Martin 
Kroeker Date: Tue, 24 Apr 2018 22:39:50 +0200 Subject: [PATCH 008/935] Drop test for zero incx,incy in armv7 AXPY ...to pass the related utest (see #1469) --- kernel/arm/axpy_vfp.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 37515f399..c35b8aece 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble axpy_kernel_L999 - +/* cmp INC_X, #0 beq axpy_kernel_L999 cmp INC_Y, #0 beq axpy_kernel_L999 - +*/ cmp INC_X, #1 bne axpy_kernel_S_BEGIN From 2d0929fa7c969cbe8f7dcbf3e5b16ef1301dc6a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 24 Apr 2018 22:43:00 +0200 Subject: [PATCH 009/935] Move the test for zero incx,incy in ARMV7 ROT to pass the related utest (see #1469) --- kernel/arm/rot_vfp.S | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index 25f563690..ea296dbc5 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble rot_kernel_L999 - +/* cmp INC_X, #0 beq rot_kernel_L999 cmp INC_Y, #0 beq rot_kernel_L999 - +*/ cmp INC_X, #1 bne rot_kernel_S_BEGIN @@ -584,6 +584,12 @@ rot_kernel_S1: rot_kernel_S10: KERNEL_S1 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 subs I, I, #1 bne rot_kernel_S10 From a8ed428bab10bc595493c1c3c029a5e8d6f25637 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 25 Apr 2018 22:35:46 +0200 Subject: [PATCH 010/935] Disable multithreading in ztrmv BLAS-Tester shows that the same problem exists as with DTRMV (issue #1332) --- interface/ztrmv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 4c47e9e91..0e16632e0 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } else nthreads = 1; +/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ + nthreads = 1; + if(nthreads > 1) { buffer_size = n > 16 ? 
0 : n * 4 + 40; } From 941ad280a8626adc621e41188f513e3fc8ab1e10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 25 Apr 2018 22:50:10 +0200 Subject: [PATCH 011/935] Fix typo in MIPS P5600 complex ASUM code selection --- kernel/mips/KERNEL.P5600 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a16704d5..1ab193069 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c else SASUMKERNEL = ../mips/asum.c DASUMKERNEL = ../mips/asum.c -CASUMKERNEL = ../mips/asum.c -ZASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c endif ifdef HAVE_MSA @@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -endif \ No newline at end of file +endif From 1b83341d194b9d8f75ec724b0c5ae64144ca3108 Mon Sep 17 00:00:00 2001 From: Zhiyong Dang Date: Tue, 24 Apr 2018 10:34:53 +0800 Subject: [PATCH 012/935] Fix race condition in blas_server_omp.c Change-Id: Ic896276cd073d6b41930c7c5a29d66348cd1725d --- Makefile.rule | 7 +++ Makefile.system | 6 +++ cmake/system.cmake | 6 +++ common.h | 2 +- driver/others/blas_server_omp.c | 91 ++++++++++++++++++++++++--------- 5 files changed, 86 insertions(+), 26 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 62bf63df4..0ce4c40a8 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,13 @@ VERSION = 0.3.0.dev # automatically detected by the the script. # NUM_THREADS = 24 +# If you have enabled USE_OPENMP and your application would call +# OpenBLAS's caculation API in multi threads, please comment it in. +# This flag define how many OpenBLAS's caculation API can actually +# run in parallel. If more number threads call OpenBLAS's caculation API, +# it would wait former API finish. +# NUM_PARALLEL = 2 + # if you don't need to install the static library, please comment it in. # NO_STATIC = 1 diff --git a/Makefile.system b/Makefile.system index 142cb420f..463b857b8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -184,6 +184,10 @@ endif endif +ifndef NUM_PARALLEL +NUM_PARALLEL = 1 +endif + ifndef NUM_THREADS NUM_THREADS = $(NUM_CORES) endif @@ -961,6 +965,8 @@ endif CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) +CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) + ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif diff --git a/cmake/system.cmake b/cmake/system.cmake index 3fdd9390c..645895671 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING) endif() +if (NOT DEFINED NUM_PARALLEL) + set(NUM_PARALLEL 1) +endif() + if (NOT DEFINED NUM_THREADS) if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) # HT? 
@@ -224,6 +228,8 @@ endif ()

 set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")

+set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
+
 if (USE_SIMPLE_THREADED_LEVEL3)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
 endif ()

diff --git a/common.h b/common.h
index 5a599a5af..86c33b2fd 100644
--- a/common.h
+++ b/common.h
@@ -179,7 +179,7 @@ extern "C" {

 #define ALLOCA_ALIGN 63UL

-#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
+#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)

 #ifdef NEEDBUNDERSCORE
 #define BLASFUNC(FUNC) FUNC##_

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 8d62a8125..868db3b1d 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -36,6 +36,13 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

+#if __STDC_VERSION__ >= 201112L
+#ifndef _Atomic
+#define _Atomic volatile
+#endif
+#include <stdatomic.h>
+#endif
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 //#include <sys/mman.h>
@@ -49,11 +56,16 @@

 int blas_server_avail = 0;

-static void * blas_thread_buffer[MAX_CPU_NUMBER];
+static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
+#if __STDC_VERSION__ >= 201112L
+static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
+#else
+static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
+#endif

 void goto_set_num_threads(int num_threads) {

-  int i=0;
+  int i=0, j=0;

   if (num_threads < 1) num_threads = blas_num_threads;

@@ -68,15 +80,17 @@ void goto_set_num_threads(int num_threads) {

   omp_set_num_threads(blas_cpu_number);

   //adjust buffer for each thread
-  for(i=0; i mode & BLAS_PTHREAD) == 0)) {

     pos = omp_get_thread_num();
-    buffer = blas_thread_buffer[pos];
+    buffer = blas_thread_buffer[buf_index][pos];

     //fallback
     if(buffer==NULL) {
@@ -291,7 +309,7 @@ static void exec_threads(blas_queue_t *queue){

 int exec_blas(BLASLONG num, blas_queue_t *queue){

-  BLASLONG i;
+  BLASLONG i, buf_index;

   if ((num <= 0) || (queue == NULL)) return 0;

@@ -302,6 +320,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
   }
 #endif

+  while(true) {
+    for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
+#if __STDC_VERSION__ >= 201112L
+      _Bool inuse = false;
+      if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
+#else
+      if(blas_buffer_inuse[i] == false) {
+        blas_buffer_inuse[i] = true;
+#endif
+        buf_index = i;
+        break;
+      }
+    }
+    if(i != MAX_PARALLEL_NUMBER)
+      break;
+  }
+
 #pragma omp parallel for schedule(static)
   for (i = 0; i < num; i ++) {

@@ -309,9 +344,15 @@
     queue[i].position = i;
 #endif

-    exec_threads(&queue[i]);
+    exec_threads(&queue[i], buf_index);
   }

+#if __STDC_VERSION__ >= 201112L
+  atomic_store(&blas_buffer_inuse[buf_index], false);
+#else
+  blas_buffer_inuse[buf_index] = false;
+#endif
+
   return 0;
 }
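The loop added to exec_blas above is a small lock-free pool: each caller scans
blas_buffer_inuse[] and claims the first free slot with a compare-and-swap,
retrying until one frees up, so at most MAX_PARALLEL_NUMBER callers hold thread
buffers concurrently. A self-contained sketch of the same claim/release idiom,
where the pool size and helper names are illustrative and a C11 compiler with
<stdatomic.h> is assumed:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define POOL_SIZE 2

    static atomic_bool in_use[POOL_SIZE];

    /* claim the first free slot, spinning until one becomes available */
    static int claim_slot(void) {
      for (;;) {
        for (int i = 0; i < POOL_SIZE; i++) {
          bool expected = false;
          /* atomically flip false -> true; fails if another caller won the race */
          if (atomic_compare_exchange_weak(&in_use[i], &expected, true))
            return i;
        }
        /* all slots busy: rescan (a real implementation might yield here) */
      }
    }

    static void release_slot(int i) {
      atomic_store(&in_use[i], false);
    }

    int main(void) {
      int s = claim_slot();
      printf("claimed slot %d\n", s);
      release_slot(s);
      return 0;
    }

atomic_compare_exchange_weak may fail spuriously even when the slot is free,
which is harmless here because the outer loop simply rescans.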
From 894433a7c71fba89b41af08acdd8fea7b48cc666 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 27 Apr 2018 12:08:06 +0200
Subject: [PATCH 013/935] Update Makefile.rule

---
 Makefile.rule | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile.rule b/Makefile.rule
index 0ce4c40a8..12734464b 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -61,10 +61,10 @@ VERSION = 0.3.0.dev
 # NUM_THREADS = 24

 # If you have enabled USE_OPENMP and your application would call
-# OpenBLAS's caculation API in multi threads, please comment it in.
-# This flag define how many OpenBLAS's caculation API can actually
-# run in parallel. If more number threads call OpenBLAS's caculation API,
-# it would wait former API finish.
+# OpenBLAS's calculation API from multi threads, please comment it in.
+# This flag defines how many instances of OpenBLAS's calculation API can
+# actually run in parallel. If more threads call OpenBLAS's calculation API,
+# they need to wait for the preceding API calls to finish or risk data corruption.
 # NUM_PARALLEL = 2

 # if you don't need to install the static library, please comment it in.

From 26ce518d4605db37083404615268b2341340ecb4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 29 Apr 2018 14:34:33 +0200
Subject: [PATCH 014/935] Avoid out of bounds reads from blas_quick_divide_table
 on big systems

Should fix #1541
---
 common_x86_64.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/common_x86_64.h b/common_x86_64.h
index bee88d3ce..0542653a1 100644
--- 
a/common_x86_64.h +++ b/common_x86_64.h @@ -195,11 +195,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ unsigned int result; if (y <= 1) return x; - + +#if (MAX_CPU_NUMBER > 64) if (y > 64) { result = x / y; return result; } +#endif y = blas_quick_divide_table[y]; From 3af1b5c805a5831d20dcae416d362e7e87515e53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:12:25 +0200 Subject: [PATCH 019/935] Make cpuid_mips compile again and add 1004K cpu --- cpuid_mips.c | 58 ++++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index 15c58959e..c09902936 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_UNKNOWN 0 #define CPU_P5600 1 +#define CPU_1004K 2 static char *cpuname[] = { "UNKOWN", - "P5600" + "P5600", + "1004K" }; int detect(void){ @@ -90,7 +92,7 @@ int detect(void){ if (!strncmp("cpu", buffer, 3)){ p = strchr(buffer, ':') + 2; #if 0 - fprintf(stderr, "%s\n", p); + fprintf(stderr, "%s \n", p); #endif break; } @@ -99,43 +101,13 @@ int detect(void){ fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ + if (strstr(p, "5600")) { + return CPU_P5600; + } else if (strstr(p, "1004K")) { + return CPU_1004K; + } else return CPU_UNKNOWN; } - } - //Check model name for Loongson3 - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("model name", buffer, 10)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } - } #endif return CPU_UNKNOWN; } @@ -149,7 +121,7 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_P5600){ + if(detect()==CPU_P5600|| detect()==CPU_1004K){ printf("P5600"); }else{ printf("UNKNOWN"); @@ -170,6 +142,14 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); + } else if (detect()==CPU_1004K) { + printf("#define MIPS1004K\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 26144\n"); + printf("#define DTB_DEFAULT_ENTRIES 8\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); }else{ printf("#define UNKNOWN\n"); } @@ -178,6 +158,8 @@ void get_cpuconfig(void){ void get_libname(void){ if(detect()==CPU_P5600) { printf("p5600\n"); + } else if (detect()==CPU_1004K) { + printf("1004K\n"); }else{ printf("mips\n"); } From d94d7baf7ea5010af47a71a4e01febb08c0d535c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:17:26 +0200 Subject: [PATCH 020/935] Add mips32r2 api target --- Makefile.prebuild | 4 ++++ Makefile.system | 9 +++++++-- param.h | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index daa556f65..a366004a1 100644 --- 
a/Makefile.prebuild +++ b/Makefile.prebuild @@ -17,6 +17,10 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif +ifeq ($(TARGET), 1004K) +TARGET_FLAGS = -mips32r2 +endif + ifeq ($(TARGET), P5600) TARGET_FLAGS = -mips32r5 endif diff --git a/Makefile.system b/Makefile.system index 142cb420f..fdc408781 100644 --- a/Makefile.system +++ b/Makefile.system @@ -564,9 +564,14 @@ CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif +ifeq ($(CORE), 1004K) +CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) +endif + ifeq ($(CORE), P5600) -CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) -FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +CCOMMON_OPT += -mips32r5 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r5 $(MSA_FLAGS) endif ifeq ($(CORE), I6400) diff --git a/param.h b/param.h index 189cdc4a0..4227d548e 100644 --- a/param.h +++ b/param.h @@ -2291,7 +2291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#if defined(P5600) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 From 9d5098dbc94cf3bfdc8e9e85043cf285d27cf0da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:20:44 +0200 Subject: [PATCH 021/935] Add MIPS 1004K target (Mediatek MT7621 SOC) --- kernel/mips/KERNEL.1004K | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/mips/KERNEL.1004K diff --git a/kernel/mips/KERNEL.1004K b/kernel/mips/KERNEL.1004K new file mode 100644 index 000000000..67135356e --- /dev/null +++ b/kernel/mips/KERNEL.1004K @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.P5600 From 018f2dad27c764d912fb5ad6cf8bf560f05f2d63 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:25:32 +0200 Subject: [PATCH 022/935] Switch mips32 target to USE_TRMM to fix complex TRMM --- kernel/Makefile.L3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 066426396..4d2999b67 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -20,6 +20,10 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif +ifeq ($(ARCH), mips) +USE_TRMM = 1 +endif + ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif From 73cc321190a5c1ba6004ecfa6df8b19321b3ed49 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:27:56 +0200 Subject: [PATCH 023/935] Add MIPS 1004K target --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index d40545cf8..aeeaa9ede 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -56,6 +56,7 @@ CELL 3.MIPS CPU: P5600 +1004K 4.MIPS64 CPU: SICORTEX From 71051259e060abc797eb59a6cc718c2f2dd2f1d6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:37:06 +0200 Subject: [PATCH 024/935] Restore compiler options for mips P5600 target --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index fdc408781..f2fdc5c4b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -570,8 +570,8 @@ FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) endif ifeq ($(CORE), P5600) -CCOMMON_OPT += -mips32r5 $(MSA_FLAGS) -FCOMMON_OPT += -mips32r5 $(MSA_FLAGS) +CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) endif ifeq ($(CORE), I6400) From 5966fd52a23683a10995202dba3e781e9dfcbf9f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 
2018 21:36:56 +0200 Subject: [PATCH 025/935] Drop C-style "L" suffix from OPENMP version number in check --- lapack-netlib/SRC/chetrd_hb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 6645121c1..91806bb1d 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -512,7 +512,7 @@ C END IF * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) From 1a8e487c4a88ef0759efc2d13b9ff3c825a7a57c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:38:25 +0200 Subject: [PATCH 026/935] Drop C-style "L" suffix from OPENMP version number in check --- lapack-netlib/SRC/dsytrd_sb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index d1ccc1a89..4ca0507e4 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -481,7 +481,7 @@ * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) From 9795adc7efb176afb72103ddfd447f92c2579387 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:39:42 +0200 Subject: [PATCH 027/935] Drop C-style "L" suffix from OPENMP version number in check --- lapack-netlib/SRC/zhetrd_hb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 7b623481b..508afca06 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -512,7 +512,7 @@ C END IF * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) From 65b8a5c5d876c25bc1387c7228535e6c7d3147ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:47:10 +0200 Subject: [PATCH 028/935] Update compiler flag for openmp use with ICC The deprecated -openmp option was finally removed in favor of -qopenmp or -fopenmp, picking the latter to stay compatible with Intel compiler versions before 2015 (when -q options were introduced). Fixes #1546 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 142cb420f..3a3e9f510 100644 --- a/Makefile.system +++ b/Makefile.system @@ -433,7 +433,7 @@ CCOMMON_OPT += -fopenmp endif ifeq ($(C_COMPILER), INTEL) -CCOMMON_OPT += -openmp +CCOMMON_OPT += -fopenmp endif ifeq ($(C_COMPILER), PGI) From d2b9389f1b49c8dee358f3e7211ac4ac707f0dd4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:55:37 +0200 Subject: [PATCH 029/935] Fixes for ifort 2018 1. the already deprecated -openmp option was removed in 2018, switch to -fopenmp 2. 
add leading blank in search for "zho_ge__" symbol to work around misleading tags in the 2018 assembly Expected to fix #1548 --- f_check | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/f_check b/f_check index 941a9a5c4..997e02393 100644 --- a/f_check +++ b/f_check @@ -97,7 +97,7 @@ if ($compiler eq "") { if ($data =~ /Intel/) { $vendor = INTEL; - $openmp = "-openmp"; + $openmp = "-fopenmp"; } if ($data =~ /Sun Fortran/) { @@ -127,7 +127,7 @@ if ($compiler eq "") { # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; - if ($data =~ /zho_ge__/) { + if ($data =~ / zho_ge__/) { $need2bu = 1; } } @@ -155,7 +155,7 @@ if ($compiler eq "") { if ($compiler =~ /ifort/) { $vendor = INTEL; $bu = "_"; - $openmp = "-openmp"; + $openmp = "-fopenmp"; } if ($compiler =~ /pathf/) { From 193f8356622c85bd494931e54f7efe379e296f88 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 May 2018 12:34:09 +0200 Subject: [PATCH 030/935] Change -openmp to -fopenmp for ifort entry as well --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 3a3e9f510..1fe7d9d3d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -713,7 +713,7 @@ FCOMMON_OPT += -i8 endif endif ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -openmp +FCOMMON_OPT += -fopenmp endif endif From d7d950fcf29c30f6611a247cf8fde4a518286b41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 May 2018 13:15:42 +0200 Subject: [PATCH 031/935] LAPACKE fixes from lapack PR249 Copied from Reference-LAPACK/lapack#249, this fixes out-of-bounds memory accesses in the nancheck calls of the LAPACKE lacgv, lassq,larfg,larfb,larfx and mtr functions --- lapack-netlib/LAPACKE/src/lapacke_clacgv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_clarfb.c | 41 ++++++++++++---------- lapack-netlib/LAPACKE/src/lapacke_clarfg.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_clarfx.c | 4 ++- lapack-netlib/LAPACKE/src/lapacke_classq.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cunmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cupmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dlarfb.c | 41 ++++++++++++---------- lapack-netlib/LAPACKE/src/lapacke_dlarfg.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dlarfx.c | 4 ++- lapack-netlib/LAPACKE/src/lapacke_dlassq.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dopmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dormtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_slarfb.c | 41 ++++++++++++---------- lapack-netlib/LAPACKE/src/lapacke_slarfg.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_slarfx.c | 4 ++- lapack-netlib/LAPACKE/src/lapacke_slassq.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sopmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sormtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlacgv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlarfb.c | 41 ++++++++++++---------- lapack-netlib/LAPACKE/src/lapacke_zlarfg.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlarfx.c | 4 ++- lapack-netlib/LAPACKE/src/lapacke_zlassq.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zunmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zupmtr.c | 2 +- 26 files changed, 122 insertions(+), 94 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacgv.c b/lapack-netlib/LAPACKE/src/lapacke_clacgv.c index 0014906ed..9a77c8ec0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clacgv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacgv.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_clacgv( lapack_int n, 
lapack_complex_float* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_c_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n, x, incx ) ) { return -2; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index 18e24509d..3aeb0d7e4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -51,16 +51,21 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -70,8 +75,8 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -79,23 +84,23 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); return -8; } - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfg.c b/lapack-netlib/LAPACKE/src/lapacke_clarfg.c index 0381a42bc..9e852a406 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_clarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfg.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha, if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfx.c b/lapack-netlib/LAPACKE/src/lapacke_clarfx.c index 977e283e1..786c21412 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfx.c @@ -38,6 +38,7 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m, lapack_complex_float tau, lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_clarfx", -1 ); return -1; @@ -51,7 +52,8 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_c_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_c_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_classq.c b/lapack-netlib/LAPACKE/src/lapacke_classq.c index b8f231dbb..e4d746c5a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_classq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_classq.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_classq( lapack_int n, lapack_complex_float* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c index 1864c4121..d9fb2dca0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c index 51f6d8276..ba026ae68 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_cupmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index 55c26f4b6..a1f49dde1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -50,16 +50,21 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? 
n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -69,8 +74,8 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -78,23 +83,23 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); return -8; } - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c index 0f627b323..df401c41d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c @@ -42,7 +42,7 @@ lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x, if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_d_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c index ab4a58e76..7b7b7201e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c @@ -37,6 +37,7 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m, lapack_int n, const double* v, double tau, double* c, lapack_int ldc, double* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dlarfx", -1 ); return -1; @@ 
-50,7 +51,8 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_d_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_d_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlassq.c b/lapack-netlib/LAPACKE/src/lapacke_dlassq.c index a564240d4..0e096b6d4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlassq.c @@ -38,7 +38,7 @@ lapack_int LAPACKE_dlassq( lapack_int n, double* x, lapack_int incx, double* sca #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_d_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c b/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c index 93d3d3d30..7fbfb11fd 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c @@ -56,7 +56,7 @@ lapack_int LAPACKE_dopmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c index 05e4c57c8..db75a6609 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index 72fa75ef1..0ebdc931a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -50,16 +50,21 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? 
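Two sizing rules recur in the larfx and ormtr/opmtr (and complex unmtr/upmtr) hunks. For ?larfx, the reflector vector v matches whichever dimension of C the operator H multiplies: m elements from the left, n from the right. For the tridiagonal-reduction multipliers, tau holds one scalar per elementary reflector, i.e. r-1 entries with r = m for side 'l' and r = n for side 'r', so checking m-1 unconditionally over-read tau whenever side = 'r' and m > n. A standalone illustration of the two sizes:

    #include <stdio.h>

    int main(void) {
        int m = 5, n = 3;
        for (int left = 1; left >= 0; left--) {
            int r = left ? m : n;   /* dimension of C that H acts on */
            printf("side='%c': v needs %d entries, tau needs %d\n",
                   left ? 'l' : 'r', r, r - 1);
        }
        return 0;
    }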
k : 1) ); if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -69,8 +74,8 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -78,23 +83,23 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); return -8; } - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfg.c b/lapack-netlib/LAPACKE/src/lapacke_slarfg.c index 295277387..ea9a83575 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfg.c @@ -42,7 +42,7 @@ lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x, if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_s_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfx.c b/lapack-netlib/LAPACKE/src/lapacke_slarfx.c index 426137815..c2b797a98 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfx.c @@ -37,6 +37,7 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m, lapack_int n, const float* v, float tau, float* c, lapack_int ldc, float* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_slarfx", -1 ); return -1; @@ -50,7 +51,8 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_s_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? 
m : n); + if( LAPACKE_s_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slassq.c b/lapack-netlib/LAPACKE/src/lapacke_slassq.c index 668289e18..3e265e359 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slassq.c @@ -38,7 +38,7 @@ lapack_int LAPACKE_slassq( lapack_int n, float* x, lapack_int incx, float* scale #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_s_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c b/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c index 333789837..bf8eed4f9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c @@ -56,7 +56,7 @@ lapack_int LAPACKE_sopmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 5a9d44138..9f0e9fddf 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c b/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c index 3b1130ba5..cd412dc24 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_z_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n, x, incx ) ) { return -2; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 6ea4960f3..4fc2eb0ab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -51,16 +51,21 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? 
m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -70,8 +75,8 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -79,23 +84,23 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); return -8; } - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c index 14e587fcc..a566a08cb 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha, if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c index 1dd1f5204..b4ebf727e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c @@ -38,6 +38,7 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m, lapack_complex_double tau, lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zlarfx", -1 ); return -1; @@ -51,7 +52,8 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_z_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? 
m : n); + if( LAPACKE_z_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlassq.c b/lapack-netlib/LAPACKE/src/lapacke_zlassq.c index a218c9b62..b8972b974 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlassq.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_zlassq( lapack_int n, lapack_complex_double* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c index f8936cd5a..433385440 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c b/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c index d735c5561..80bbd9529 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zupmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { return -8; } } From 3716267124854b4da70b6ddadea5084c66971648 Mon Sep 17 00:00:00 2001 From: Zhiyong Dang Date: Fri, 11 May 2018 12:15:08 +0800 Subject: [PATCH 032/935] Change _STDC_VERSION__ to __STDC_VERSION__ Change-Id: Id3fa4e8d9eedd4ef7230df69b611e7f397301a42 --- driver/level3/level3_gemm3m_thread.c | 2 +- driver/level3/level3_syrk_threaded.c | 2 +- driver/level3/level3_thread.c | 2 +- driver/others/blas_server_omp.c | 8 ++++---- lapack/getrf/getrf_parallel.c | 10 +++++----- lapack/getrf/potrf_parallel.c | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index f5e5bca1e..4903aa5bd 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -91,7 +91,7 @@ #endif typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index d1c476f00..574f825b0 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -67,7 +67,7 @@ #endif typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 47b20f7fa..4ab1ee8cc 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,7 +91,7 @@ #endif typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 868db3b1d..cc00092cd 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -36,7 +36,7 @@ /* or implied, of The University of Texas at Austin. 
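The rename in the patch above matters more than it looks: _STDC_VERSION__ (a single leading underscore) is defined by no compiler, and an unknown identifier evaluates to 0 inside #if, so every one of these guards silently took the volatile branch and the C11 _Atomic path was dead code everywhere. The corrected feature test, as a self-contained sketch:

    /* C11 feature test done right: two underscores on each side of
       STDC.  With the misspelled macro, #if sees 0 >= 201112L. */
    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
      #include <stdatomic.h>
      typedef _Atomic long counter_t;    /* genuine C11 atomic */
    #else
      typedef volatile long counter_t;   /* legacy fallback; no ordering guarantees */
    #endif

    counter_t done_flag;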
*/ /*********************************************************************/ -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L #ifndef _Atomic #define _Atomic volatile #endif @@ -57,7 +57,7 @@ int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #else static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; @@ -322,7 +322,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ while(true) { for(i=0; i < MAX_PARALLEL_NUMBER; i++) { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Bool inuse = false; if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { #else @@ -347,7 +347,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ exec_threads(&queue[i], buf_index); } -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L atomic_store(&blas_buffer_inuse[buf_index], false); #else blas_buffer_inuse[buf_index] = false; diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 4e742b994..591ce4a99 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -119,7 +119,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *sbb = sb; -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; #else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; @@ -201,7 +201,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra /* Non blocking implementation */ typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile @@ -246,7 +246,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * blasint *ipiv = (blasint *)args -> c; BLASLONG jw; -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; #else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; @@ -452,7 +452,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #ifdef _MSC_VER BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; #else -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile @@ -728,7 +728,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, nn, num_cpu; -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c index 104022dd9..c2fee6bd1 100644 --- a/lapack/getrf/potrf_parallel.c +++ b/lapack/getrf/potrf_parallel.c @@ -101,7 +101,7 @@ static FLOAT dm1 = -1.; #endif typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile From 53457f222fa6b553be313324911ea9f5c97e1db7 Mon Sep 17 00:00:00 2001 From: "zhiyong.dang" Date: Fri, 11 May 2018 00:13:16 -0700 Subject: [PATCH 033/935] move _Atomic define to common.h --- common.h | 6 ++++++ driver/others/blas_server_omp.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common.h b/common.h index 86c33b2fd..123e3dee7 100644 --- a/common.h +++ b/common.h @@ -649,6 +649,12 @@ int omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int 
__cdecl omp_get_num_procs(void); #endif +#if (__STDC_VERSION__ >= 201112L) +#ifndef _Atomic +#define _Atomic volatile +#endif +#include +#endif #else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index cc00092cd..fccdb4320 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -36,12 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#if __STDC_VERSION__ >= 201112L -#ifndef _Atomic -#define _Atomic volatile -#endif -#include -#endif #include #include #include From 41ae8e8d677cd85ddd15ad5be723bafcd88a9d62 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 May 2018 12:11:38 +0200 Subject: [PATCH 034/935] Add threading and OpenMP information to output For #1416 and #1529, more information about the options OpenBLAS was built with is needed. Additionally we may want to add this data to the openblas.pc file (but not all projects use pkgconfig, and as far as I am aware the cmake module for accessing it does not make such "private" declarations available) --- driver/others/openblas_get_config.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 7d041b907..87a27712f 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -54,6 +54,9 @@ static char* openblas_config_str="" #ifdef NO_AFFINITY "NO_AFFINITY " #endif +#ifdef USE_OPENMP + "USE_OPENMP " +#endif #ifndef DYNAMIC_ARCH CHAR_CORENAME #endif @@ -61,18 +64,23 @@ static char* openblas_config_str="" #ifdef DYNAMIC_ARCH char *gotoblas_corename(); -static char tmp_config_str[256]; #endif +static char tmp_config_str[256]; +int openblas_get_parallel(); char* CNAME() { -#ifndef DYNAMIC_ARCH - return openblas_config_str; -#else +char tmpstr[20]; strcpy(tmp_config_str, openblas_config_str); +#ifdef DYNAMIC_ARCH strcat(tmp_config_str, gotoblas_corename()); - return tmp_config_str; #endif +if (openblas_get_parallel() == 0) + sprintf(tmpstr, " SINGLE_THREADED"); +else + snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); + strcat(tmp_config_str, tmpstr); + return tmp_config_str; } @@ -83,3 +91,4 @@ char* openblas_get_corename() { return gotoblas_corename(); #endif } + From a07843bc938b0595f2a453e9ae276c5a781504f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 May 2018 22:11:27 +0200 Subject: [PATCH 035/935] Overwrite any pre-existing openblas.pc rather than append to it --- Makefile.install | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 9ce5ceae6..6176fb0ff 100644 --- a/Makefile.install +++ b/Makefile.install @@ -96,7 +96,7 @@ endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" From 7d7564568cb6641e78b17e2d31c29a2162ce6db1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 May 2018 00:09:35 +0200 Subject: [PATCH 
036/935] Add build-time configuration options to pkgconfig file --- cmake/openblas.pc.in | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 113ba8526..35973b09b 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,6 +1,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ From eb9b021d3890429a41823dc3d90eb0d11c0a6d6d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 May 2018 00:10:15 +0200 Subject: [PATCH 037/935] Add build-time configuration options to pkgconfig file --- Makefile.install | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 9ce5ceae6..21c3c9e22 100644 --- a/Makefile.install +++ b/Makefile.install @@ -96,8 +96,9 @@ endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" From 893b535540bb71ad766ca7d56d819630e91a8715 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 May 2018 14:42:12 +0200 Subject: [PATCH 038/935] Use correct data type for initializers of v2f64, v4f32 Fixes #1561 --- kernel/mips/dgemv_n_msa.c | 4 ++-- kernel/mips/sgemv_n_msa.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/mips/dgemv_n_msa.c b/kernel/mips/dgemv_n_msa.c index 82c3a96cf..380b94d06 100644 --- a/kernel/mips/dgemv_n_msa.c +++ b/kernel/mips/dgemv_n_msa.c @@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v2f64 v_alpha; - v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0; + v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0}; v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; - v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; + v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0}; v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); diff --git a/kernel/mips/sgemv_n_msa.c b/kernel/mips/sgemv_n_msa.c index e1ecb5473..66e3adebf 100644 --- a/kernel/mips/sgemv_n_msa.c +++ b/kernel/mips/sgemv_n_msa.c @@ -423,9 
+423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, FLOAT *y_org = y; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0; + v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0}; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; - v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; + v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0}; v_alpha = COPY_FLOAT_TO_VECTOR(alpha); From 7a7619af6df1fc7754cd30ff8310e1c24bcee7bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 May 2018 11:40:08 +0200 Subject: [PATCH 039/935] Revert changes from PR#1419 at least one of these changes apparently is an oversimplification, leading to TRMM breakage on some platforms as observed in #1563 --- kernel/generic/trmm_ltcopy_2.c | 32 +++++++++++----- kernel/generic/trmm_utcopy_16.c | 67 ++++++++++++++++++--------------- kernel/generic/trmm_utcopy_2.c | 37 +++++++++++------- kernel/generic/trmm_utcopy_4.c | 41 +++++++++++--------- kernel/generic/trsm_ltcopy_4.c | 2 +- kernel/generic/ztrmm_ltcopy_2.c | 46 ++++++++++++++++++---- kernel/generic/ztrsm_utcopy_1.c | 2 +- kernel/generic/ztrsm_utcopy_2.c | 4 +- 8 files changed, 147 insertions(+), 84 deletions(-) diff --git a/kernel/generic/trmm_ltcopy_2.c b/kernel/generic/trmm_ltcopy_2.c index 60cdeed1c..e9ad45fa0 100644 --- a/kernel/generic/trmm_ltcopy_2.c +++ b/kernel/generic/trmm_ltcopy_2.c @@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (m & 1) { if (X > posY) { - /* ao1 += 1; - ao2 += 1; */ + ao1 += 1; + ao2 += 1; b += 2; } else -#ifdef UNIT if (X < posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; } else { +#ifdef UNIT + data02 = *(ao1 + 1); b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; } -#endif - b[ 1] = *(ao1 + 1); - b += 2; } posY += 2; @@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/trmm_utcopy_16.c b/kernel/generic/trmm_utcopy_16.c index 12642e7db..b83989f55 100644 --- a/kernel/generic/trmm_utcopy_16.c +++ b/kernel/generic/trmm_utcopy_16.c @@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 15); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; a04 += i; @@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a13 += i; a14 += i; a15 += i; - a16 += i; */ + a16 += i; b += 16 * i; } else if (X > posY) { @@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; a04 += i; a05 += i; a06 += i; a07 += i; - a08 += i; */ + a08 += i; b += 8 * i; } else if (X > posY) { @@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 8; } - /* a02 += i * lda; + a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * 
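Back on the MSA initializer fix (#1561) a few hunks up: v2f64 and v4f32 are vector types, and under GCC-style vector extensions a vector object needs a braced initializer with one value per lane; a bare scalar such as 0.0 is rejected. A standalone illustration, assuming the usual vector_size definitions in place of the msa.h typedefs:

    /* Assumed stand-ins for the msa.h vector typedefs: */
    typedef double v2f64 __attribute__((vector_size(16)));
    typedef float  v4f32 __attribute__((vector_size(16)));

    v2f64 y0 = {0, 0};            /* ok: one initializer per lane  */
    v4f32 t0 = {0, 0, 0, 0};      /* ok                            */
    /* v2f64 bad = 0.0;              error: scalar initializer for
                                      a vector type                */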
lda; - a08 += i * lda; */ + a08 += i * lda; } else { #ifdef UNIT b[ 0] = ONE; @@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; - a04 += i; */ + a04 += i; b += 4 * i; } else if (X > posY) { @@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += lda; b += 4; } - /* a02 += lda; + a02 += lda; a03 += lda; - a04 += lda; */ + a04 += lda; } else { #ifdef UNIT @@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { a01 ++; a02 ++; - } else { -#ifdef UNIT + b += 2; + } else if (X > posY) { -#endif b[ 0] = *(a01 + 0); -#ifdef UNIT + b[ 1] = *(a01 + 1); + a01 += lda; + b += 2; } else { +#ifdef UNIT b[ 0] = ONE; - } + b[ 1] = *(a01 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); #endif - b[ 1] = *(a01 + 1); - } - b += 2; + b += 2; + } } posY += 2; } @@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i > 0) { do { if (X < posY) { - a01 ++; - } else { -#ifdef UNIT + a01 += 1; + b ++; + } else if (X > posY) { -#endif b[ 0] = *(a01 + 0); -#ifdef UNIT + a01 += lda; + b ++; } else { +#ifdef UNIT b[ 0] = ONE; - } +#else + b[ 0] = *(a01 + 0); #endif - a01 += lda; - } - b ++; - X ++; - i --; + a01 += lda; + b ++; + } + + X += 1; + i --; } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/trmm_utcopy_2.c b/kernel/generic/trmm_utcopy_2.c index 75076c382..ae4a19e32 100644 --- a/kernel/generic/trmm_utcopy_2.c +++ b/kernel/generic/trmm_utcopy_2.c @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (m & 1) { if (X < posY) { - /* ao1 += 1; - ao2 += 1; */ + ao1 += 1; + ao2 += 1; b += 2; } else if (X > posY) { @@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = data02; - // ao1 += lda; + ao1 += lda; b += 2; } else { #ifdef UNIT @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = ZERO; #endif - // ao1 += lda; + ao1 += lda; b += 2; } } @@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = m; if (m > 0) { do { + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += lda; + } else { #ifdef UNIT - if (X > posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT - } else { - b[ 0] = ONE; - } + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - b ++; - ao1 += lda; - X ++; + b += 1; + ao1 += lda; + } + + X += 1; i --; } while (i > 0); } diff --git a/kernel/generic/trmm_utcopy_4.c b/kernel/generic/trmm_utcopy_4.c index e5844094e..441f7338b 100644 --- a/kernel/generic/trmm_utcopy_4.c +++ b/kernel/generic/trmm_utcopy_4.c @@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { if (m & 2) { - /* ao1 += 2; + ao1 += 2; ao2 += 2; ao3 += 2; - ao4 += 2; */ + ao4 += 2; b += 8; } if (m & 1) { - /* ao1 += 1; + ao1 += 1; ao2 += 1; ao3 += 1; - ao4 += 1; */ + ao4 += 1; b += 4; } @@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data08; ao1 += 2 * lda; - // ao2 += 2 * lda; + ao2 += 2 * lda; b += 8; } @@ -253,7 +253,7 @@ int 
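A common thread in the restorations of this revert: the packing kernels carry ao1..., posY and friends across branches and loop iterations, so an increment that looks dead inside one branch can still be load-bearing on the next pass (or in the m & 1 tail). A deliberately tiny, hypothetical analogue of the hazard:

    /* Hypothetical miniature: even when a row contributes nothing,
       the source cursor must still advance, or every later row is
       read from the wrong place. */
    static void pack_rows(int m, int w, const double *src, int lda,
                          double *dst, const unsigned char *skip) {
        int i, k;
        for (i = 0; i < m; i++) {
            if (!skip[i]) {
                for (k = 0; k < w; k++) dst[k] = src[k];
                dst += w;
            }
            src += lda;   /* advancing in BOTH cases is the point */
        }
    }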
CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = data03; b[ 3] = data04; - // ao1 += lda; + ao1 += lda; b += 4; } @@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i) { if (X < posY) { - // ao1 += 2; + ao1 += 2; b += 2; } else if (X > posY) { @@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = data02; - // ao1 += lda; + ao1 += lda; b += 2; } else { #ifdef UNIT @@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { + b += 1; ao1 += 1; - } else { -#ifdef UNIT + } else if (X > posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; } else { +#ifdef UNIT b[ 0] = ONE; - } +#else + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += lda; - } - b ++; - X ++; + ao1 += lda; + b += 1; + } + + X += 1; i --; } while (i > 0); } diff --git a/kernel/generic/trsm_ltcopy_4.c b/kernel/generic/trsm_ltcopy_4.c index 07bb137d4..12043eb33 100644 --- a/kernel/generic/trsm_ltcopy_4.c +++ b/kernel/generic/trsm_ltcopy_4.c @@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } a1 += 2 * lda; - // a2 += 2 * lda; + a2 += 2 * lda; b += 8; ii += 2; diff --git a/kernel/generic/ztrmm_ltcopy_2.c b/kernel/generic/ztrmm_ltcopy_2.c index 7969f4f3d..457890ceb 100644 --- a/kernel/generic/ztrmm_ltcopy_2.c +++ b/kernel/generic/ztrmm_ltcopy_2.c @@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { -#ifdef UNIT + + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X < posY) { -#endif - b[ 0] = *(ao1 + 0); - b[ 1] = *(ao1 + 1); -#ifdef UNIT + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + + ao1 += lda; + b += 4; } else { +#ifdef UNIT + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + b[ 0] = ONE; b[ 1] = ZERO; - } + b[ 2] = data3; + b[ 3] = data4; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; #endif - b += 4; + b += 4; + } } posY += 2; @@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 0e33a7d18..08f85e891 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01 = 0.0, data02 = 0.0; + FLOAT data01, data02; FLOAT *a1; lda *= 2; diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index c34d741ee..387bb2532 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01 = 0.0, data02 = 0.0, data03, data04; - FLOAT data05, data06, data07 = 0.0, data08 = 0.0; + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; lda *= 2; From 95f7f0229cf277d111206ae6841769d578e45580 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 May 
2018 18:43:59 +0200 Subject: [PATCH 040/935] Remove extraneous brace from previous commit --- kernel/mips/dot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/dot.c b/kernel/mips/dot.c index cbd3efc64..89c9f80f6 100644 --- a/kernel/mips/dot.c +++ b/kernel/mips/dot.c @@ -42,7 +42,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) while(i < n) { #if defined(DSDOT) - dot += (double)(y[iy] * (double)x[ix] ; + dot += (double)y[iy] * (double)x[ix] ; #else dot += y[iy] * x[ix]; #endif From 82012b960b1c9427957bd87cc53f860823eeb674 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 May 2018 20:30:03 +0200 Subject: [PATCH 041/935] Revert " Switch mips32 target to USE_TRMM to fix complex TRMM" ... as it was just a silly workaround for the issue seen in #1563, caused by #1419 --- kernel/Makefile.L3 | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4d2999b67..066426396 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -20,10 +20,6 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif -ifeq ($(ARCH), mips) -USE_TRMM = 1 -endif - ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif From 961d25e9c7e4a1758adb1dbeaa15187de69dd052 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 May 2018 22:54:39 +0200 Subject: [PATCH 042/935] Use the new zrot.c on POWER8 for crot as well fixes #1571 (the old zrot.S assembly does not handle incx=0 correctly) --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 00ff8682a..1aa061078 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = ../arm/zrot.c +CROTKERNEL = zrot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c From 43e592ceb38a56716279a6514ceca1ec9bdb0865 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 24 May 2018 20:56:24 +0800 Subject: [PATCH 043/935] Add -lm for Android. 
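On the dot.c correction above: the DSDOT path exists to accumulate single-precision inputs in double precision, so each operand is promoted before the multiply, and the stray '(' had left that expression unbalanced. The intended reference semantics, sketched standalone (hypothetical name):

    /* DSDOT flavor: float data, double products and accumulator. */
    static double dsdot_ref(int n, const float *x, int incx,
                            const float *y, int incy) {
        double dot = 0.0;
        int i, ix = 0, iy = 0;
        for (i = 0; i < n; i++) {
            dot += (double)y[iy] * (double)x[ix];  /* promote, then multiply */
            ix += incx;
            iy += incy;
        }
        return dot;
    }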
Conflicts: exports/Makefile --- exports/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 53d4f75bb..127b05057 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -128,6 +128,8 @@ so : ../$(LIBSONAME) ifeq ($(OSNAME), Android) INTERNALNAME = $(LIBPREFIX).so +FEXTRALIB += -lm +EXTRALIB += -lm else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif From 908d40be715bfb252972a0a4abf27726a729945f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 29 May 2018 14:27:46 +0200 Subject: [PATCH 044/935] Adapt lapack-test and blas-test to changes in netlib directory layout partial fix for #1574 - the problem with lapack_testing.py looks like an upstream bug --- Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index c0e5fbcf8..380ba1ce8 100644 --- a/Makefile +++ b/Makefile @@ -294,9 +294,10 @@ endif lapack-test : (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc ifneq ($(CROSS), 1) - ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ + ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) endif @@ -308,9 +309,9 @@ lapack-runtest: blas-test: - (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) + (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing - (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) + (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) dummy : From a7dbd4c57d22b580b32f3a97b0b327bf2fedf551 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 11:19:33 +0200 Subject: [PATCH 045/935] Fix paths to LIN and EIG tests should fix 1574 --- lapack-netlib/lapack_testing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 3c917482d..5d07e1e87 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -257,16 +257,16 @@ for dtype in range_prec: else: if dtest==16: # LIN TESTS - cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==17: # PROTO LIN TESTS - cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==18: # PROTO LIN TESTS - cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS - cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" if (not just_errors and not short_summary): print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or 
run the LAPACK testing From 5fae96fb70cbc1205e50220f77722ac5ff92f0d8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:43:45 +0200 Subject: [PATCH 046/935] Update version to 0.3.1.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b5789119a..f49f20513 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 0.dev) +set(OpenBLAS_PATCH_VERSION 1.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From b491b10057196c5735a261608ec110b1bbd134d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:44:36 +0200 Subject: [PATCH 047/935] Update version to 0.3.1.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 12734464b..1b4b8eb63 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.0.dev +VERSION = 0.3.1.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From d1b7be14aa9b57ca4df9c00cdb4611974729b3be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:52:04 +0200 Subject: [PATCH 048/935] Handle INCX=0,INCY=0 case Fixes #1575 (sswap/dswap failing the swap utest on x86) as suggested by atsampson. --- kernel/x86/swap.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S index 54b00b33e..d3cf04942 100644 --- a/kernel/x86/swap.S +++ b/kernel/x86/swap.S @@ -138,6 +138,14 @@ /* INCX != 1 or INCY != 1 */ .L14: + cmpl $0, %ebx + jne .L141 + cmpl $0, %ecx + jne .L141 +/* INCX == 0 and INCY == 0 */ + jmp .L27 + +.L141 movl %edx, %eax sarl $2, %eax jle .L28 From a91f1587b9be6c9bbc403a79970d3e2a03bf866c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 13:26:00 +0200 Subject: [PATCH 049/935] Work around name clash with Windows10's winnt.h fixes #1503 --- driver/level3/Makefile | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 352225206..e320092e3 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) 
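On the swap.S hunk above: with INCX = 0 and INCY = 0 every iteration re-swaps the same pair x[0], y[0], so the case is detected up front and branched out of the unrolled strided path. For orientation, the semantics ?SWAP must honor, as a plain C sketch (hypothetical name):

    /* ?SWAP with arbitrary strides; note that with both increments
       zero the net effect depends on the parity of n. */
    static void sswap_ref(int n, float *x, int incx, float *y, int incy) {
        int i, ix = 0, iy = 0;
        for (i = 0; i < n; i++) {
            float t = x[ix]; x[ix] = y[iy]; y[iy] = t;
            ix += incx;
            iy += incy;
        }
    }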
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c 
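Why -DCR=CR rather than plain -DCR in the rules above and below: -DCR makes the preprocessor replace every later CR token with 1, which breaks any system header that uses CR as an ordinary identifier (the winnt.h clash of #1503). Defining the macro as its own name keeps #ifdef CR true, so the right GEMM variant is still selected, while expansion stops immediately because a macro is never expanded recursively. In miniature:

    /* Effect of the two definitions on an ordinary use of the name: */
    #define CR 1
    /* int CR;        -> preprocesses to "int 1;" : syntax error     */
    #undef CR

    #define CR CR     /* self-referential: expands once, to CR       */
    #ifdef CR         /* still defined, so the variant is compiled   */
    int CR;           /* stays "int CR;" -- no recursive expansion   */
    #endif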
level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) 
$(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o 
$(@F) @@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) From 2fc748bf7200ca53d66d43107dc2c732685519d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 13:41:12 +0200 Subject: [PATCH 050/935] Restore optimized swap kernel now that we have a proper fix --- kernel/x86/KERNEL.NEHALEM | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86/KERNEL.NEHALEM b/kernel/x86/KERNEL.NEHALEM index 835520efb..65b03ae50 100644 --- a/kernel/x86/KERNEL.NEHALEM +++ b/kernel/x86/KERNEL.NEHALEM @@ -1,3 +1 @@ include $(KERNELDIR)/KERNEL.PENRYN -SSWAPKERNEL = ../arm/swap.c -DSWAPKERNEL = ../arm/swap.c From 7df8c4f76fa7aadd8d1bce1d99fe826a4826d775 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 17:23:08 +0200 Subject: [PATCH 051/935] typo fix --- kernel/x86/swap.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S index d3cf04942..e30c27898 100644 --- a/kernel/x86/swap.S +++ b/kernel/x86/swap.S @@ -145,7 +145,7 @@ /* INCX == 0 and INCY == 0 */ jmp .L27 -.L141 +.L141: movl %edx, %eax sarl $2, %eax jle .L28 From e2a8c35e5a6897e5aebf5e2fb8ba18f94735c89a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:08:14 +0200 Subject: [PATCH 052/935] Fixes from netlib PR253 LAPACKE interfaces for Aasen's functions now call ?sytrf_aa and ?hetrf_aa instead of ?sytrf and ?hetrf --- lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c index b4a7595d8..e4d538779 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_chetrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_chetrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_chetrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c index d4f24142b..f6661c85c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_csytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_csytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_csy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_csytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c index cbf97b632..e72bfa6de 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c @@ -40,7 +40,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_dsytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -55,7 +55,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_dsytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -67,7 +67,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_dsytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c index d68cb17c1..182946a45 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c @@ -40,7 +40,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_ssytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -55,7 +55,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_ssytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -67,7 +67,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_ssytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c index 5214217fb..dbad2d81e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_zhetrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_zhetrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_zhetrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c index 29d75319e..03726c63e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_zsytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_zsytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_zsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_zsytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } From 677e42d7b0c6b6c40af94268fbb9d9be60f7af0a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:12:59 +0200 Subject: [PATCH 053/935] Fixes from netlib PR 253 When minimal workspace is given in ?hesv_aa, ?sysv_aa, ?hesv_aa_2stage, ?sysv_aa_2stage, now no error is given Quick return for ?laqr1 --- lapack-netlib/SRC/cgejsv.f | 4 ++-- lapack-netlib/SRC/chesv_aa.f | 5 ++--- lapack-netlib/SRC/chesv_aa_2stage.f | 15 +++++++++------ lapack-netlib/SRC/chetrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/chetrs_aa_2stage.f | 1 + lapack-netlib/SRC/cla_syamv.f | 2 +- lapack-netlib/SRC/claqr1.f | 7 +++++++ lapack-netlib/SRC/csysv_aa.f | 3 --- lapack-netlib/SRC/csysv_aa_2stage.f | 15 +++++++++------ lapack-netlib/SRC/csytrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/csytri2.f | 6 +++--- lapack-netlib/SRC/csytrs_aa_2stage.f | 1 + lapack-netlib/SRC/ctrevc3.f | 18 +++++++++--------- lapack-netlib/SRC/dgelqt.f | 2 +- lapack-netlib/SRC/dla_syamv.f | 2 +- lapack-netlib/SRC/dlaqr1.f | 7 +++++++ lapack-netlib/SRC/dsysv_aa.f | 3 --- lapack-netlib/SRC/dsysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/dsytrf_aa_2stage.f | 8 ++++++-- lapack-netlib/SRC/dsytri2.f | 6 +++--- lapack-netlib/SRC/dsytrs_aa_2stage.f | 1 + lapack-netlib/SRC/dtrevc3.f | 4 ++-- lapack-netlib/SRC/iparmq.f | 4 ++-- lapack-netlib/SRC/sla_syamv.f | 2 +- lapack-netlib/SRC/slaqr1.f | 7 +++++++ lapack-netlib/SRC/ssysv_aa.f | 3 --- lapack-netlib/SRC/ssysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/ssytrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/ssytri2.f | 4 ++-- lapack-netlib/SRC/ssytrs_aa_2stage.f | 1 + 
lapack-netlib/SRC/strevc3.f | 12 ++++++------ lapack-netlib/SRC/zgejsv.f | 4 ++-- lapack-netlib/SRC/zhesv_aa.f | 5 ++--- lapack-netlib/SRC/zhesv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/zhetrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/zhetrs_aa_2stage.f | 7 ++++--- lapack-netlib/SRC/zla_syamv.f | 2 +- lapack-netlib/SRC/zlaqr1.f | 7 +++++++ lapack-netlib/SRC/zsysv_aa.f | 3 --- lapack-netlib/SRC/zsysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/zsytrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/zsytri2.f | 2 +- lapack-netlib/SRC/zsytrs_aa_2stage.f | 1 + 43 files changed, 155 insertions(+), 101 deletions(-) diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 8eb43cf50..a7b1c451c 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -701,7 +701,7 @@ LWSVDJ = MAX( 2 * N, 1 ) LWSVDJV = MAX( 2 * N, 1 ) * .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ - LRWQP3 = N + LRWQP3 = 2 * N LRWCON = N LRWSVDJ = N IF ( LQUERY ) THEN @@ -939,7 +939,7 @@ END IF END IF MINWRK = MAX( 2, MINWRK ) - OPTWRK = MAX( 2, OPTWRK ) + OPTWRK = MAX( OPTWRK, MINWRK ) IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 END IF diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 0bf636b48..470f910bc 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -209,6 +209,8 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 + ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + INFO = -10 END IF * IF( INFO.EQ.0 ) THEN @@ -219,9 +221,6 @@ LWKOPT_HETRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 057d9c57a..05f6b7bb7 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -233,19 +235,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. 
.NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN @@ -270,6 +271,8 @@ END IF * WORK( 1 ) = LWKOPT +* + RETURN * * End of CHESV_AA_2STAGE * diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 0fa2ae3a0..ce34d73cc 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -658,6 +660,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL CGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of CHETRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/chetrs_aa_2stage.f b/lapack-netlib/SRC/chetrs_aa_2stage.f index 3f8576673..05d09275b 100644 --- a/lapack-netlib/SRC/chetrs_aa_2stage.f +++ b/lapack-netlib/SRC/chetrs_aa_2stage.f @@ -87,6 +87,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/cla_syamv.f b/lapack-netlib/SRC/cla_syamv.f index e1d3df960..695b5e478 100644 --- a/lapack-netlib/SRC/cla_syamv.f +++ b/lapack-netlib/SRC/cla_syamv.f @@ -241,7 +241,7 @@ INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'SSYMV ', INFO ) + CALL XERBLA( 'CLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/claqr1.f b/lapack-netlib/SRC/claqr1.f index b76bedf60..977947196 100644 --- a/lapack-netlib/SRC/claqr1.f +++ b/lapack-netlib/SRC/claqr1.f @@ -142,6 +142,13 @@ CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = CABS1( H( 1, 1 )-S2 ) + CABS1( H( 2, 1 ) ) IF( S.EQ.RZERO ) THEN diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f index 9cd669d33..87be734cc 100644 --- a/lapack-netlib/SRC/csysv_aa.f +++ b/lapack-netlib/SRC/csysv_aa.f @@ -221,9 +221,6 @@ LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f index cba57fc3e..a13349824 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.f +++ b/lapack-netlib/SRC/csysv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. 
*> @@ -233,19 +235,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN @@ -270,6 +271,8 @@ END IF * WORK( 1 ) = LWKOPT +* + RETURN * * End of CSYSV_AA_2STAGE * diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f index 0a6bfbe31..0d0bd156c 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.f +++ b/lapack-netlib/SRC/csytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -662,6 +664,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL CGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of CSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/csytri2.f b/lapack-netlib/SRC/csytri2.f index 4c6baaa3e..4bd8e4f99 100644 --- a/lapack-netlib/SRC/csytri2.f +++ b/lapack-netlib/SRC/csytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -163,7 +163,7 @@ UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'CSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/csytrs_aa_2stage.f b/lapack-netlib/SRC/csytrs_aa_2stage.f index 03bccda82..d025c08fe 100644 --- a/lapack-netlib/SRC/csytrs_aa_2stage.f +++ b/lapack-netlib/SRC/csytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f index c06b40477..a134c1a50 100644 --- a/lapack-netlib/SRC/ctrevc3.f +++ b/lapack-netlib/SRC/ctrevc3.f @@ -27,8 +27,8 @@ * .. * .. Array Arguments .. * LOGICAL SELECT( * ) -* REAL RWORK( * ) -* COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), +* REAL RWORK( * ) +* COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), * $ WORK( * ) * .. * @@ -258,17 +258,17 @@ * .. * .. Array Arguments .. 
LOGICAL SELECT( * ) - REAL RWORK( * ) - COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), + REAL RWORK( * ) + COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), $ WORK( * ) * .. * * ===================================================================== * * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) - COMPLEX CZERO, CONE + COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) INTEGER NBMIN, NBMAX @@ -277,13 +277,13 @@ * .. Local Scalars .. LOGICAL ALLV, BOTHV, LEFTV, LQUERY, OVER, RIGHTV, SOMEV INTEGER I, II, IS, J, K, KI, IV, MAXWRK, NB - REAL OVFL, REMAX, SCALE, SMIN, SMLNUM, ULP, UNFL - COMPLEX CDUM + REAL OVFL, REMAX, SCALE, SMIN, SMLNUM, ULP, UNFL + COMPLEX CDUM * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ICAMAX - REAL SLAMCH, SCASUM + REAL SLAMCH, SCASUM EXTERNAL LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM * .. * .. External Subroutines .. diff --git a/lapack-netlib/SRC/dgelqt.f b/lapack-netlib/SRC/dgelqt.f index 2124f3dc3..5b4ee65b5 100644 --- a/lapack-netlib/SRC/dgelqt.f +++ b/lapack-netlib/SRC/dgelqt.f @@ -158,7 +158,7 @@ INTEGER I, IB, IINFO, K * .. * .. External Subroutines .. - EXTERNAL DGEQRT2, DGELQT3, DGEQRT3, DLARFB, XERBLA + EXTERNAL DGELQT3, DLARFB, XERBLA * .. * .. Executable Statements .. * diff --git a/lapack-netlib/SRC/dla_syamv.f b/lapack-netlib/SRC/dla_syamv.f index 29566a6e9..bb6dbe288 100644 --- a/lapack-netlib/SRC/dla_syamv.f +++ b/lapack-netlib/SRC/dla_syamv.f @@ -230,7 +230,7 @@ INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'DSYMV ', INFO ) + CALL XERBLA( 'DLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/dlaqr1.f b/lapack-netlib/SRC/dlaqr1.f index 81a462fb3..795b072ab 100644 --- a/lapack-netlib/SRC/dlaqr1.f +++ b/lapack-netlib/SRC/dlaqr1.f @@ -147,6 +147,13 @@ INTRINSIC ABS * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = ABS( H( 1, 1 )-SR2 ) + ABS( SI2 ) + ABS( H( 2, 1 ) ) IF( S.EQ.ZERO ) THEN diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index cbccd5e65..7192928c6 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -221,9 +221,6 @@ LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index ac3c77d76..05e538f0b 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -107,6 +107,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -126,7 +127,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -152,6 +153,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -235,19 +237,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. 
.NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index f5f06cc1d..25fc1a2eb 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -109,6 +110,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -128,10 +130,10 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the -*> row and column IPIV(k). +*> row and column IPIV2(k). *> \endverbatim *> *> \param[out] INFO @@ -641,6 +643,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL DGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of DSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index 9aa21a854..23f8b9fa2 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -163,7 +163,7 @@ UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/dsytrs_aa_2stage.f b/lapack-netlib/SRC/dsytrs_aa_2stage.f index caff5d4ad..bb283cb95 100644 --- a/lapack-netlib/SRC/dsytrs_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f index 745f636d0..957baf4f0 100644 --- a/lapack-netlib/SRC/dtrevc3.f +++ b/lapack-netlib/SRC/dtrevc3.f @@ -45,9 +45,9 @@ *> The right eigenvector x and the left eigenvector y of T corresponding *> to an eigenvalue w are defined by: *> -*> T*x = w*x, (y**H)*T = w*(y**H) +*> T*x = w*x, (y**T)*T = w*(y**T) *> -*> where y**H denotes the conjugate transpose of y. +*> where y**T denotes the transpose of the vector y. *> The eigenvalues are not input to this routine, but are read directly *> from the diagonal blocks of T. 
*> diff --git a/lapack-netlib/SRC/iparmq.f b/lapack-netlib/SRC/iparmq.f index e576e0db0..a9212b3e0 100644 --- a/lapack-netlib/SRC/iparmq.f +++ b/lapack-netlib/SRC/iparmq.f @@ -104,13 +104,13 @@ *> *> \param[in] NAME *> \verbatim -*> NAME is character string +*> NAME is CHARACTER string *> Name of the calling subroutine *> \endverbatim *> *> \param[in] OPTS *> \verbatim -*> OPTS is character string +*> OPTS is CHARACTER string *> This is a concatenation of the string arguments to *> TTQRE. *> \endverbatim diff --git a/lapack-netlib/SRC/sla_syamv.f b/lapack-netlib/SRC/sla_syamv.f index d40e7bd95..4459f4d8b 100644 --- a/lapack-netlib/SRC/sla_syamv.f +++ b/lapack-netlib/SRC/sla_syamv.f @@ -230,7 +230,7 @@ INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'SSYMV ', INFO ) + CALL XERBLA( 'SLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/slaqr1.f b/lapack-netlib/SRC/slaqr1.f index 7d7d851ee..2de33849d 100644 --- a/lapack-netlib/SRC/slaqr1.f +++ b/lapack-netlib/SRC/slaqr1.f @@ -147,6 +147,13 @@ INTRINSIC ABS * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = ABS( H( 1, 1 )-SR2 ) + ABS( SI2 ) + ABS( H( 2, 1 ) ) IF( S.EQ.ZERO ) THEN diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index abf52b143..e470f5883 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -220,9 +220,6 @@ LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index a738c7415..43d937141 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -106,6 +106,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -125,7 +126,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -151,6 +152,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -234,19 +236,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index a92974930..0e0f6edb7 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. 
*> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -641,6 +643,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL SGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of SSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 97b539005..4b9ea4e7b 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO diff --git a/lapack-netlib/SRC/ssytrs_aa_2stage.f b/lapack-netlib/SRC/ssytrs_aa_2stage.f index c9c7181f2..d271b9481 100644 --- a/lapack-netlib/SRC/ssytrs_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f index 0df1189f0..525978071 100644 --- a/lapack-netlib/SRC/strevc3.f +++ b/lapack-netlib/SRC/strevc3.f @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * LOGICAL SELECT( * ) -* REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), +* REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), * $ WORK( * ) * .. * @@ -45,9 +45,9 @@ *> The right eigenvector x and the left eigenvector y of T corresponding *> to an eigenvalue w are defined by: *> -*> T*x = w*x, (y**H)*T = w*(y**H) +*> T*x = w*x, (y**T)*T = w*(y**T) *> -*> where y**H denotes the conjugate transpose of y. +*> where y**T denotes the transpose of the vector y. *> The eigenvalues are not input to this routine, but are read directly *> from the diagonal blocks of T. *> @@ -251,14 +251,14 @@ * .. * .. Array Arguments .. LOGICAL SELECT( * ) - REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), + REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), $ WORK( * ) * .. * * ===================================================================== * * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) INTEGER NBMIN, NBMAX PARAMETER ( NBMIN = 8, NBMAX = 128 ) @@ -268,7 +268,7 @@ $ RIGHTV, SOMEV INTEGER I, IERR, II, IP, IS, J, J1, J2, JNXT, K, KI, $ IV, MAXWRK, NB, KI2 - REAL BETA, BIGNUM, EMAX, OVFL, REC, REMAX, SCALE, + REAL BETA, BIGNUM, EMAX, OVFL, REC, REMAX, SCALE, $ SMIN, SMLNUM, ULP, UNFL, VCRIT, VMAX, WI, WR, $ XNORM * .. diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index e8418c680..d553da90b 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -704,7 +704,7 @@ LWSVDJ = MAX( 2 * N, 1 ) LWSVDJV = MAX( 2 * N, 1 ) * .. 
minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ - LRWQP3 = N + LRWQP3 = 2 * N LRWCON = N LRWSVDJ = N IF ( LQUERY ) THEN @@ -942,7 +942,7 @@ END IF END IF MINWRK = MAX( 2, MINWRK ) - OPTWRK = MAX( 2, OPTWRK ) + OPTWRK = MAX( MINWRK, OPTWRK ) IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 END IF diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index bbd0fdff4..8511f0e7d 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -209,6 +209,8 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 + ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + INFO = -10 END IF * IF( INFO.EQ.0 ) THEN @@ -219,9 +221,6 @@ LWKOPT_HETRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index a34440029..ed221dc69 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -106,6 +106,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -125,7 +126,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -151,6 +152,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -240,19 +242,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 4d62198d6..73c0ebe9a 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. 
*> @@ -657,6 +659,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL ZGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of ZHETRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/zhetrs_aa_2stage.f b/lapack-netlib/SRC/zhetrs_aa_2stage.f index 02e17476f..7fcee1118 100644 --- a/lapack-netlib/SRC/zhetrs_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrs_aa_2stage.f @@ -69,7 +69,7 @@ *> *> \param[in] A *> \verbatim -*> A is COMPLEX*16array, dimension (LDA,N) +*> A is COMPLEX*16 array, dimension (LDA,N) *> Details of factors computed by ZHETRF_AA_2STAGE. *> \endverbatim *> @@ -81,12 +81,13 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (LTB) *> Details of factors computed by ZHETRF_AA_2STAGE. *> \endverbatim *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> @@ -106,7 +107,7 @@ *> *> \param[in,out] B *> \verbatim -*> B is COMPLEX*16array, dimension (LDB,NRHS) +*> B is COMPLEX*16 array, dimension (LDB,NRHS) *> On entry, the right hand side matrix B. *> On exit, the solution matrix X. *> \endverbatim diff --git a/lapack-netlib/SRC/zla_syamv.f b/lapack-netlib/SRC/zla_syamv.f index 02958bef3..cfdb3cdc8 100644 --- a/lapack-netlib/SRC/zla_syamv.f +++ b/lapack-netlib/SRC/zla_syamv.f @@ -241,7 +241,7 @@ INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'DSYMV ', INFO ) + CALL XERBLA( 'ZLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/zlaqr1.f b/lapack-netlib/SRC/zlaqr1.f index 03afb87aa..34341cb10 100644 --- a/lapack-netlib/SRC/zlaqr1.f +++ b/lapack-netlib/SRC/zlaqr1.f @@ -142,6 +142,13 @@ CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = CABS1( H( 1, 1 )-S2 ) + CABS1( H( 2, 1 ) ) IF( S.EQ.RZERO ) THEN diff --git a/lapack-netlib/SRC/zsysv_aa.f b/lapack-netlib/SRC/zsysv_aa.f index 10693c731..325d07c54 100644 --- a/lapack-netlib/SRC/zsysv_aa.f +++ b/lapack-netlib/SRC/zsysv_aa.f @@ -221,9 +221,6 @@ LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f index fcf9bc870..029ed587d 100644 --- a/lapack-netlib/SRC/zsysv_aa_2stage.f +++ b/lapack-netlib/SRC/zsysv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -233,19 +235,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. 
.NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f index 1f916726e..d3486c1a7 100644 --- a/lapack-netlib/SRC/zsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -662,6 +664,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL ZGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of ZSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/zsytri2.f b/lapack-netlib/SRC/zsytri2.f index d5aabd43a..e7303c90b 100644 --- a/lapack-netlib/SRC/zsytri2.f +++ b/lapack-netlib/SRC/zsytri2.f @@ -163,7 +163,7 @@ UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'ZSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/zsytrs_aa_2stage.f b/lapack-netlib/SRC/zsytrs_aa_2stage.f index c5d894753..fa15eee90 100644 --- a/lapack-netlib/SRC/zsytrs_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> From c5b13d4e10d38eb1bad56aac21bc9ffcf0b577df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:14:45 +0200 Subject: [PATCH 054/935] Fixes from netlib PR 253 --- lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f index 5698bcf94..f6d990d1c 100644 --- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f @@ -218,7 +218,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DERRSY, DLACPY, DLARHS, - $ DLATB4, DLATMS, DPOT02, DSYTRF_AA_2STAGE + $ DLATB4, DLATMS, DPOT02, DSYTRF_AA_2STAGE, $ DSYTRS_AA_2STAGE, XLAENV * .. * .. Intrinsic Functions .. diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f index 0be321eb0..898422654 100644 --- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f @@ -204,7 +204,7 @@ * .. External Subroutines .. EXTERNAL ALADHD, ALAERH, ALASVM, XLAENV, DERRVX, $ DGET04, DLACPY, DLARHS, DLATB4, DLATMS, - $ DSYSV_AA_2STAGE, CHET01_AA, DPOT02, + $ DSYSV_AA_2STAGE, DPOT02, $ DSYTRF_AA_2STAGE * .. * .. Scalars in Common .. 
diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f index d8d9dc0a9..70e8ff6b8 100644 --- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f @@ -203,7 +203,7 @@ * .. * .. External Subroutines .. EXTERNAL ALADHD, ALAERH, ALASVM, XLAENV, SERRVX, - $ CGET04, SLACPY, SLARHS, SLATB4, SLATMS, + $ SLACPY, SLARHS, SLATB4, SLATMS, $ SSYSV_AA_2STAGE, SSYT01_AA, SPOT02, $ SSYTRF_AA_2STAGE * .. diff --git a/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f index d4d8c2939..87fc47f71 100644 --- a/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f @@ -217,8 +217,8 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, CERRSY, ZLACPY, ZLARHS, - $ CLATB4, ZLATMS, ZSYT02, ZSYT01, + EXTERNAL ALAERH, ALAHD, ALASUM, ZERRSY, ZLACPY, ZLARHS, + $ ZLATB4, ZLATMS, ZSYT02, ZSYT01, $ ZSYTRF_AA_2STAGE, ZSYTRS_AA_2STAGE, $ XLAENV * .. From a8002e283a5874946bb464a45045d4651081e675 Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Fri, 1 Jun 2018 23:20:00 +0100 Subject: [PATCH 055/935] Revert "take out unused variables" This reverts commit e5752ff9b322c665a7393d6109c2da7ad6ee2523. The variables i and n are used in the `#if !__GLIBC_PREREQ(2, 7)` branch. Closes gh-1586. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ef328b945..d69e52e97 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -180,7 +180,7 @@ int get_num_procs(void) { cpu_set_t *cpusetp; size_t size; int ret; -// int i,n; +int i,n; if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) From 99c7bba8e404fcf697f00bc986e106892eff47ad Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 3 Jun 2018 07:24:29 +0000 Subject: [PATCH 056/935] Initial support for SkylakeX / AVX512 This patch adds the basic infrastructure for adding the SkylakeX (Intel Skylake server) target. The SkylakeX target will use the AVX512 (AVX512VL level) instruction set, which brings 2 basic things: 1) 512 bit wide SIMD (2x width of AVX2) 2) 32 SIMD registers (2x the number on AVX2) This initial patch only contains a trivial transformation of the Haswell SGEMM kernel to AVX512VL; more will follow later but this patch aims to get the infrastructure in place for this "later". Full performance tuning has not been done yet; with more registers and wider SIMD it's in theory possible to retune the kernels but even without that there's an interesting enough performance increase (30-40% range) with just this change.
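As an illustration of point 1) above: the effect of doubling the SIMD width on an SGEMM inner-loop update can be sketched in C intrinsics. This is a minimal sketch, not OpenBLAS code — the kernel this patch actually adds is hand-written assembly (sgemm_kernel_16x4_skylakex.S) and the helper names below are invented for illustration; it assumes a compiler with AVX512F and FMA support (e.g. gcc -mavx512f -mfma):

#include <immintrin.h>

/* c[0:16] += a[0:16] * b: one 16-wide fused multiply-add on AVX512 */
static inline void fma16_avx512(float *c, const float *a, float b)
{
    __m512 vb = _mm512_set1_ps(b);                     /* broadcast the scalar */
    __m512 vc = _mm512_loadu_ps(c);                    /* 16 packed floats     */
    vc = _mm512_fmadd_ps(_mm512_loadu_ps(a), vb, vc);  /* vc = a*vb + vc       */
    _mm512_storeu_ps(c, vc);
}

/* the same update needs two 8-wide operations on AVX2 */
static inline void fma16_avx2(float *c, const float *a, float b)
{
    __m256 vb = _mm256_set1_ps(b);
    _mm256_storeu_ps(c,     _mm256_fmadd_ps(_mm256_loadu_ps(a),     vb, _mm256_loadu_ps(c)));
    _mm256_storeu_ps(c + 8, _mm256_fmadd_ps(_mm256_loadu_ps(a + 8), vb, _mm256_loadu_ps(c + 8)));
}

Halving the instruction count per update is plausibly where much of the immediate 30-40% comes from; the doubled register file (zmm0-zmm31, point 2) is what a retuned kernel could use to keep a larger tile of C resident in registers, which is the further tuning alluded to above.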
--- Makefile.system | 8 +- TargetList.txt | 1 + cmake/arch.cmake | 3 + cmake/system.cmake | 2 +- cpuid.h | 3 + cpuid_x86.c | 2 + driver/others/dynamic.c | 2 + driver/others/parameter.c | 4 +- getarch.c | 15 + kernel/CMakeLists.txt | 2 +- kernel/Makefile.L3 | 4 + kernel/setparam-ref.c | 16 + kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.SKYLAKEX | 4 + kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 2 +- kernel/x86_64/cgemv_n_4.c | 2 +- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/cscal.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dscal.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 6812 ++++++++++++++++++++ kernel/x86_64/sgemv_n_4.c | 2 +- kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 2 +- kernel/x86_64/zgemv_n_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- kernel/x86_64/zscal.c | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- param.h | 119 + 57 files changed, 7034 insertions(+), 47 deletions(-) create mode 100644 kernel/x86_64/KERNEL.SKYLAKEX create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex.S diff --git a/Makefile.system b/Makefile.system index 7bfac1fa8..b005b80c9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -62,6 +62,9 @@ ifeq ($(BINARY), 32) ifeq ($(TARGET), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), SKYLAKEX) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -95,6 +98,9 @@ ifeq ($(BINARY), 32) ifeq ($(TARGET_CORE), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), SKYLAKEX) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -467,7 +473,7 @@ ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += HASWELL ZEN +DYNAMIC_CORE += HASWELL ZEN SKYLAKEX endif endif diff --git a/TargetList.txt b/TargetList.txt index aeeaa9ede..31e4881c4 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -20,6 +20,7 @@ DUNNINGTON NEHALEM SANDYBRIDGE HASWELL +SKYLAKEX ATOM b)AMD CPU: diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 798a9ef82..527d2bec6 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -56,6 +56,9 @@ if (DYNAMIC_ARCH) if (NOT NO_AVX2) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () + if (NOT NO_AVX512) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) + endif () endif () if (NOT DYNAMIC_CORE) diff --git a/cmake/system.cmake b/cmake/system.cmake index 645895671..c21fe7c14 100644 
--- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") diff --git a/cpuid.h b/cpuid.h index 1dacc49ba..a6bc211f3 100644 --- a/cpuid.h +++ b/cpuid.h @@ -115,6 +115,7 @@ #define CORE_STEAMROLLER 25 #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 +#define CORE_SKYLAKEX 28 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -137,6 +138,7 @@ #define HAVE_AVX (1 << 18) #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) +#define HAVE_AVX512VL (1 << 21) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -211,5 +213,6 @@ typedef struct { #define CPUTYPE_STEAMROLLER 49 #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 +#define CPUTYPE_SKYLAKEX 52 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 342c56525..5f49e7715 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -50,6 +50,8 @@ #ifdef NO_AVX #define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CORE_HASWELL CORE_NEHALEM +#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM +#define CORE_SKYLAKEX CORE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index fbf7cd40e..a0c9794b1 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -74,6 +74,7 @@ extern gotoblas_t gotoblas_STEAMROLLER; extern gotoblas_t gotoblas_EXCAVATOR; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; @@ -83,6 +84,7 @@ extern gotoblas_t gotoblas_ZEN; //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM +#define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 31a48644f..e7332c0c4 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -167,7 +167,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || 
defined(SKYLAKEX) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 992fc2b95..fcffe63e2 100644 --- a/getarch.c +++ b/getarch.c @@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "HASWELL" #endif +#ifdef FORCE_SKYLAKEX +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SKYLAKEX" +#define ARCHCONFIG "-DSKYLAKEX " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" +#define LIBNAME "skylakex" +#define CORENAME "SKYLAKEX" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index c06d1eae8..947114ebe 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") + if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") set(USE_TRMM true) endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 066426396..b37e536ef 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif +ifeq ($(CORE), SKYLAKEX) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b6c5b54de..9030d7c6d 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -871,6 +871,22 @@ static void init_parameter(void) { #endif #endif +#ifdef SKYLAKEX + +#ifdef DEBUG + fprintf(stderr, "SkylakeX\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef OPTERON diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 0b475afa2..34653d400 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index e98854f34..492f34344 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define 
PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 086852cfc..6840c54ad 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 2dd8ad08b..361ccf603 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 154276f6a..11825429e 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index acdcd6e22..4c054f399 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index da561b583..e67496736 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index a11b0286a..498057697 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 787ab5982..f3072983d 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S 
b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 9a3b0cbd7..879ae9c38 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index bd7a78b5a..6c308197b 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX new file mode 100644 index 000000000..744831d67 --- /dev/null +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -0,0 +1,4 @@ +include $(KERNELDIR)/KERNEL.HASWELL + +SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S + diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index b1ec19bd3..586d05ac2 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 5f01f7eeb..93fca0a0d 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 770c955b2..d81766cd4 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index d75e58fdd..6bdea6787 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 9b9179da0..72af99809 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 4bde62824..b4acdccd2 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "daxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 8162a5d83..059549028 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ddot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 1b9ca7a60..309fbe767 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) +#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) #include "dgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 6b99d6fdd..a7478e3a8 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 428558617..2c7b3b17c 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dscal_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 3e8db3fa3..73099462c 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dsymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 61cb77a64..431e4bb3f 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index d89fe408a..d89c4070d 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index b6f3c21af..c3ab2ffe6 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S new file mode 100644 index 000000000..1fab892ca --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S @@ -0,0 +1,6812 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + 
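+/* For reference, a minimal C sketch (illustrative only, not part of the build)
+ * of the update that one KERNEL16x6_SUB step below performs. The names acc, a
+ * and b are hypothetical: acc[0..5] stands for the accumulator registers
+ * zmm4/6/8/10/12/14, a for the 16-float vector loaded from AO (vmovups), and
+ * b for the six values broadcast from BO (vbroadcastss).
+ *
+ * static void kernel16x6_step(float acc[6][16], const float a[16],
+ *                             const float b[6])
+ * {
+ *     for (int j = 0; j < 6; j++)        // six broadcast values of B
+ *         for (int i = 0; i < 16; i++)   // one zmm-wide vector of A
+ *             acc[j][i] += a[i] * b[j];  // vfmadd231ps via VFMADD231PS_
+ * }
+ *
+ * Each step then advances BO by 6*SIZE, AO by 16*SIZE and decrements the k
+ * counter in %rax: one rank-1 update of a 16x6 block of C per k iteration.
+ */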
+.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm2 + vbroadcastss -3 * SIZE(BO), %zmm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm2 + vbroadcastss -1 * SIZE(BO), %zmm3 + VFMADD231PS_( %zmm8,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm3,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm2 + vbroadcastss 1 * SIZE(BO), %zmm3 + VFMADD231PS_( %zmm12,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm3,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro KERNEL16x6_SUB4 + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm2 + vbroadcastss -3 * SIZE(BO), %zmm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm7 + vbroadcastss -1 * SIZE(BO), %zmm9 + VFMADD231PS_( %zmm8,%zmm7,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm9,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm11 + vbroadcastss 1 * SIZE(BO), %zmm13 + VFMADD231PS_( %zmm12,%zmm11,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm13,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm16 + vbroadcastss -3 * SIZE(BO), %zmm17 + + VFMADD231PS_( %zmm4,%zmm16,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm17,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm18 + vbroadcastss -1 * SIZE(BO), %zmm19 + VFMADD231PS_( %zmm8,%zmm18,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm19,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm20 + vbroadcastss 1 * SIZE(BO), %zmm21 + VFMADD231PS_( %zmm12,%zmm20,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm21,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm22 + vbroadcastss -3 * SIZE(BO), %zmm23 + + VFMADD231PS_( %zmm4,%zmm22,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm23,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm24 + vbroadcastss -1 * SIZE(BO), %zmm25 + VFMADD231PS_( %zmm8,%zmm24,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm25,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm26 + vbroadcastss 1 * SIZE(BO), %zmm27 + VFMADD231PS_( %zmm12,%zmm26,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm27,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm28 + vbroadcastss -3 * SIZE(BO), %zmm29 + + VFMADD231PS_( %zmm4,%zmm28,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm29,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm30 + vbroadcastss -1 * SIZE(BO), %zmm31 + VFMADD231PS_( %zmm8,%zmm30,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm31,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm1 + vbroadcastss 1 * SIZE(BO), %zmm5 + VFMADD231PS_( %zmm12,%zmm1,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm5,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + vmulps %zmm0 , %zmm8 , %zmm8 + vmulps %zmm0 , %zmm10, %zmm10 + vmulps %zmm0 , %zmm12, %zmm12 + vmulps %zmm0 , %zmm14, %zmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + + vaddps (CO1, LDC,2), %zmm8,%zmm8 + + vaddps (CO2), %zmm10,%zmm10 + + vaddps (CO2, LDC), %zmm12,%zmm12 + + vaddps (CO2, LDC,2), %zmm14,%zmm14 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + + vmovups %zmm8 , (CO1, LDC,2) + + vmovups %zmm10, (CO2) + + vmovups %zmm12, (CO2, LDC) + + vmovups %zmm14, (CO2, LDC,2) + +.endm + + + + 
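+/* A hedged C sketch of the corresponding SAVE16x6 step above, assuming the
+ * plain GEMM path (when TRMMKERNEL is defined the read-back of C is skipped
+ * and the scaled accumulators are stored directly). C0 and ldc are
+ * hypothetical shorthand for CO1 and the element stride; the six column
+ * pointers CO1, CO1+LDC, CO1+2*LDC, CO2, CO2+LDC and CO2+2*LDC are one LDC
+ * apart because the caller sets CO2 = CO1 + 3*LDC.
+ *
+ * static void save16x6(float *C0, long ldc, float alpha,
+ *                      const float acc[6][16])
+ * {
+ *     for (int j = 0; j < 6; j++)        // one 16-float column per zmm store
+ *         for (int i = 0; i < 16; i++)
+ *             C0[j * ldc + i] = alpha * acc[j][i] + C0[j * ldc + i];
+ * }
+ */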
+/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , 
%xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss %xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm8,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm3,%zmm0 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + vmulps %zmm0 , %zmm8 , %zmm8 + vmulps %zmm0 , %zmm10, %zmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + + vaddps (CO2), 
%zmm8,%zmm8 + + vaddps (CO2, LDC), %zmm10,%zmm10 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + + vmovups %zmm8 , (CO2) + + vmovups %zmm10, (CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * 
SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * 
SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + +#endif + + vmovups %zmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if 
!defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L6_16 + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + 
+ addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + 
KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L7_16 + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + 
prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), 
AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = 
rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C 
// c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + 
vzeroall
+
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq	K, %rax
+	subq	KK, %rax
+	movq	%rax, KKK
+#else
+	movq	KK, %rax
+#ifdef LEFT
+	addq	$8, %rax	// number of values in A
+#else
+	addq	$2, %rax	// number of values in BO
+#endif
+	movq	%rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_20_6
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+
+	salq	$3, %rax	// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_2:
+
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	je	.L2_20_6
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	je	.L2_20_6
+
+	jmp	.L2_20_2
+	ALIGN_4
+
+.L2_20_6:
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#else
+	movq	KKK, %rax
+#endif
+
+	andq	$7, %rax		# if (k & 1)
+	je .L2_20_9
+
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+
+	salq	$3, %rax	// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_7:
+
+	KERNEL8x2_SUB
+
+	jl	.L2_20_7
+	ALIGN_4
+
+
+.L2_20_9:
+
+	SAVE8x2
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq	K, %rax
+	subq	KKK, %rax
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+	leaq	(BO, BI, SIZE), BO
+	salq	$3, %rax	// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq	$8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+	testq	$4, M
+	jz	.L2_30
+	ALIGN_4
+
+.L2_21:
+#if !defined(TRMMKERNEL) || \
+	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+#else
+	movq	KK, %rax
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+	leaq	(BO, BI, SIZE), BO
+	salq	$2, %rax	// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq	K, %rax
+	subq	KK, %rax
+	movq	%rax, KKK
+#else
+	movq	KK, %rax
+#ifdef LEFT
+	addq	$4, %rax	// number of values in A
+#else
+	addq	$2, %rax	// number of values in BO
+#endif
+	movq	%rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_26
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+
+	salq	$2, %rax	// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_22:
+
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je	.L2_26
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je	.L2_26
+
+	jmp	.L2_22
+	ALIGN_4
+
+.L2_26:
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#else
+	movq	KKK, %rax
+#endif
+
+	andq	$7, %rax		# if (k & 1)
+	je
.L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // 
first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + 
andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + 
movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of 
values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, 
%rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq 
KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 
* SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number 
of values
+
+	salq	$3, %rax	// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_2:
+
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	je	.L2_20_6
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+	KERNEL8x2_SUB
+
+	je	.L2_20_6
+
+	jmp	.L2_20_2
+	ALIGN_4
+
+.L2_20_6:
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#else
+	movq	KKK, %rax
+#endif
+
+	andq	$7, %rax		# if (k & 1)
+	je .L2_20_9
+
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+
+	salq	$3, %rax	// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_20_7:
+
+	KERNEL8x2_SUB
+
+	jl	.L2_20_7
+	ALIGN_4
+
+
+.L2_20_9:
+
+	SAVE8x2
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq	K, %rax
+	subq	KKK, %rax
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+	leaq	(BO, BI, SIZE), BO
+	salq	$3, %rax	// rax = rax * 8 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq	$8, KK
+#endif
+
+	addq	$8 * SIZE, CO1		# coffset += 8
+	ALIGN_4
+
+
+
+/**************************************************************************/
+
+.L2_21pre:
+
+	testq	$4, M
+	jz	.L2_30
+	ALIGN_4
+
+.L2_21:
+#if !defined(TRMMKERNEL) || \
+	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+#else
+	movq	KK, %rax
+	leaq	BUFFER1, BO		// first buffer to BO
+	addq	$4 * SIZE, BO
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+	leaq	(BO, BI, SIZE), BO
+	salq	$2, %rax	// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq	K, %rax
+	subq	KK, %rax
+	movq	%rax, KKK
+#else
+	movq	KK, %rax
+#ifdef LEFT
+	addq	$4, %rax	// number of values in A
+#else
+	addq	$2, %rax	// number of values in BO
+#endif
+	movq	%rax, KKK
+#endif
+
+
+	andq	$-8, %rax
+	je	.L2_26
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+
+	salq	$2, %rax	// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_22:
+
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je	.L2_26
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je	.L2_26
+
+	jmp	.L2_22
+	ALIGN_4
+
+.L2_26:
+#ifndef TRMMKERNEL
+	movq	K, %rax
+#else
+	movq	KKK, %rax
+#endif
+
+	andq	$7, %rax		# if (k & 1)
+	je	.L2_29
+
+	movq	%rax, BI	// Index for BO
+	leaq	(BI,BI,1), BI	// BI = BI * 2 ; number of values
+
+	salq	$2, %rax	// rax = rax * 4 ; number of values
+	leaq	(AO, %rax, SIZE), AO
+	leaq	(BO, BI, SIZE), BO
+	negq	BI
+	negq	%rax
+	ALIGN_4
+
+.L2_27:
+
+	KERNEL4x2_SUB
+
+	jl	.L2_27
+	ALIGN_4
+
+
+.L2_29:
+
+	SAVE4x2
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO 
+#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je 
.L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + 
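
/* Addressing note (a reading aid, inferred from the setup code above): before
 * each multiply loop AO and BO are advanced past the ends of the packed A and
 * B panels and the index registers (%rax, BI) are negated, so the kernels walk
 * upward through negative displacements toward zero; the `jl` at the bottom of
 * the tail loops repeats while the index is still below zero. */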
+/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + 
KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index fd028964b..65305ac59 100644 --- 
a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index f04d461f7..065e5b385 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 199d8a517..73ae001ea 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 691a071f7..f37c251a1 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 8cae3fc1b..8a5c44c9b 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index d7091624d..0c40a3435 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 3549b9863..7a2eeace5 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S 
b/kernel/x86_64/symv_U_sse2.S index 882b035a9..0408b577c 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 8cb1d532f..53866cf95 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index d11c76647..ef12569c8 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index f6f88155c..0fedc496b 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 3e4b7d5df..2ab7a671b 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index aa5d8fac0..2a6d0e4c7 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index dd95eea17..e44bd7550 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 75124cf3e..e9f330c36 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index db1a4ff5f..9f0dead18 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 599765a6d..b6106a37d 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 4227d548e..49a5e85e8 100644 --- a/param.h +++ b/param.h @@ -1613,6 +1613,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#endif + +#ifdef SKYLAKEX + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#ifdef WINDOWS_ABI +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 128 +#else +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#endif +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 8 +#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define ZGEMM3M_DEFAULT_UNROLL_N 8 +#define ZGEMM3M_DEFAULT_UNROLL_M 2 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif + + #endif From 00235157339dc5fba2b4194bd660c45257e539e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 13:22:59 +0200 Subject: [PATCH 057/935] Typo fix (misplaced parenthesis) --- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 361ccf603..e2f731fca 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL || defined (SKYLAKEX)) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || 
defined (SKYLAKEX)
 #define PREFETCH prefetcht0
 #define PREFETCHSIZE (8 * 21 + 4)
 #endif

From f1fb9a474571846ffc140313dbe5b8ba21925b74 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 3 Jun 2018 13:48:27 +0200
Subject: [PATCH 058/935] Propagate NO_AVX512 if needed

---
 Makefile.system | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index b005b80c9..cec4b44e5 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -147,6 +147,10 @@ ifeq ($(NO_AVX2), 1)
 GETARCH_FLAGS += -DNO_AVX2
 endif
 
+ifeq ($(NO_AVX512), 1)
+GETARCH_FLAGS += -DNO_AVX512
+endif
+
 ifeq ($(DEBUG), 1)
 GETARCH_FLAGS += -g
 endif

From a7d0f49cec68dc3f116feed0320708ae004af4c4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 3 Jun 2018 23:13:25 +0200
Subject: [PATCH 059/935] Add SKYLAKEX to DYNAMIC_CORE list only if AVX512 is available

---
 Makefile.system | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index cec4b44e5..82e38a6d2 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -477,7 +477,12 @@ ifneq ($(NO_AVX), 1)
 DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
 endif
 ifneq ($(NO_AVX2), 1)
-DYNAMIC_CORE += HASWELL ZEN SKYLAKEX
+DYNAMIC_CORE += HASWELL ZEN
+endif
+ifneq ($(NO_AVX512), 1)
+ifneq ($(NO_AVX2), 1)
+DYNAMIC_CORE += SKYLAKEX
+endif
 endif
 endif

From 5a92b311e05fb938e1fd85dcaf6fbeebc77bd4fb Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 3 Jun 2018 23:29:07 +0200
Subject: [PATCH 060/935] Separate Skylake X from Skylake

---
 cpuid_x86.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 5f49e7715..d0dbe1d24 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1301,6 +1301,19 @@ int get_cpuname(void){
      else
	return CPUTYPE_NEHALEM;
      case 5:
+	// Skylake X
+#ifndef NO_AVX512
+	return CPUTYPE_SKYLAKEX;
+#else
+	if(support_avx())
+#ifndef NO_AVX2
+	return CPUTYPE_HASWELL;
+#else
+	return CPUTYPE_SANDYBRIDGE;
+#endif
+	else
+	return CPUTYPE_NEHALEM;
+#endif
      case 14:
	// Skylake
	if(support_avx())
@@ -1558,6 +1571,7 @@ static char *cpuname[] = {
  "STEAMROLLER",
  "EXCAVATOR",
  "ZEN",
+  "SKYLAKEX"
 };
 
 static char *lowercpuname[] = {
@@ -1612,6 +1626,7 @@ static char *lowercpuname[] = {
  "steamroller",
  "excavator",
  "zen",
+  "skylakex"
 };
 
 static char *corename[] = {
@@ -1643,6 +1658,7 @@ static char *corename[] = {
  "STEAMROLLER",
  "EXCAVATOR",
  "ZEN",
+  "SKYLAKEX"
 };
 
 static char *corename_lower[] = {
@@ -1674,6 +1690,7 @@ static char *corename_lower[] = {
  "steamroller",
  "excavator",
  "zen",
+  "skylakex"
 };
 
 
@@ -1862,6 +1879,19 @@ int get_coretype(void){
      else
	return CORE_NEHALEM;
      case 5:
+	// Skylake X
+#ifndef NO_AVX512
+	return CORE_SKYLAKEX;
+#else
+	if(support_avx())
+#ifndef NO_AVX2
+	return CORE_HASWELL;
+#else
+	return CORE_SANDYBRIDGE;
+#endif
+	else
+	return CORE_NEHALEM;
+#endif
      case 14:
	// Skylake
	if(support_avx())

From 5a51cf4576df2e065e5517b04369ff10a2a83f58 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 3 Jun 2018 23:41:33 +0200
Subject: [PATCH 061/935] Separate Skylake X from Skylake

---
 driver/others/dynamic.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index a0c9794b1..5e9a24b8b 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -79,6 +79,11 @@ extern gotoblas_t gotoblas_EXCAVATOR;
 #else
 extern gotoblas_t gotoblas_HASWELL;
 extern gotoblas_t gotoblas_ZEN;
+#ifndef NO_AVX512
+extern gotoblas_t 
gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_HASWELL; +#endif #endif #else //Use NEHALEM kernels for sandy bridge @@ -286,8 +291,21 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + if (model == 5) { + // Intel Skylake X +#ifndef NO_AVX512 + return $gotoblas_SKYLAKEX; +#else + if(support_avx()) + return &gotoblas_HASWELL; + else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } +#endif //Intel Skylake - if (model == 14 || model == 5) { + if (model == 14) { if(support_avx()) return &gotoblas_HASWELL; else{ @@ -447,7 +465,8 @@ static char *corename[] = { "Haswell", "Steamroller", "Excavator", - "Zen" + "Zen", + "SkylakeX" }; char *gotoblas_corename(void) { @@ -475,7 +494,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_ZEN) return corename[23]; - + if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; return corename[0]; } @@ -505,6 +524,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 24: return (&gotoblas_SKYLAKEX); case 23: return (&gotoblas_ZEN); case 22: return (&gotoblas_EXCAVATOR); case 21: return (&gotoblas_STEAMROLLER); From 83fec56a3f55fa24b2e541549852bdee03d30a0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 00:01:11 +0200 Subject: [PATCH 062/935] Disable AVX512 (Skylake X) support if the build system is too old --- c_check | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/c_check b/c_check index a3b337602..dfe99350a 100644 --- a/c_check +++ b/c_check @@ -201,6 +201,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); +$no_avx512= 0; +if (($architecture eq "x86") || ($architecture eq "x86_64")) { + $code = '"vaddps %zmm1, %zmm0, %zmm0"'; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + $args = " -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); + system(@cmd) == 0; + if ($? 
!= 0) { + $no_avx512 = 1; + } else { + $no_avx512 = 0; + } + unlink("tmpf.o"); +} + $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -288,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; +print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; From ef626c6824c26415bc074d11325245e72f9e3284 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 00:13:19 +0200 Subject: [PATCH 063/935] typo fix --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5e9a24b8b..2c902d108 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -294,7 +294,7 @@ static gotoblas_t *get_coretype(void){ if (model == 5) { // Intel Skylake X #ifndef NO_AVX512 - return $gotoblas_SKYLAKEX; + return &gotoblas_SKYLAKEX; #else if(support_avx()) return &gotoblas_HASWELL; From 89372e0993b7d9fe9061797625713519392fa42b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 3 Jun 2018 22:15:09 +0000 Subject: [PATCH 064/935] Use AVX512 also for DGEMM this required switching to the generic gemm_beta code (which is faster anyway on SKX) for both DGEMM and SGEMM Performance for the not-retuned version is in the 30% range --- kernel/x86_64/KERNEL.SKYLAKEX | 15 + kernel/x86_64/dgemm_kernel_16x2_skylakex.S | 5138 ++++++++++++++++++++ kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 3 +- 3 files changed, 5154 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_16x2_skylakex.S diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 744831d67..c273ff8cd 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -2,3 +2,18 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S + +DTRMMKERNEL = ../generic/trmmkernel_16x2.c +DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c \ No newline at end of file diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.S b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S new file mode 100644 index 000000000..91ac51280 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S @@ -0,0 +1,5138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + 
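+/* Both FMA macro variants compute acc += a * b: the four-operand FMA4
+   forms in this branch (vfmaddpd / vfmaddsd, BULLDOZER only) and the
+   three-operand FMA3 forms in the #else branch (vfmadd231pd /
+   vfmadd231sd), which accumulate in place; the macros hide the differing
+   operand order from the kernel bodies. */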
+.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + +#define A_PR1 1024 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_SUBN + vbroadcastsd -12 * SIZE(BO), %zmm1 + vbroadcastsd -11 * SIZE(BO), %zmm2 + vbroadcastsd -10 * SIZE(BO), %zmm3 + + vmovaps -16 * SIZE(AO), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + + vmovaps -8 * SIZE(AO), %zmm9 + VFMADD231PD_ %zmm10,%zmm1,%zmm9 + VFMADD231PD_ %zmm11,%zmm2,%zmm9 + VFMADD231PD_ %zmm12,%zmm3,%zmm9 + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO +.endm + + +.macro KERNEL8x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + +.macro KERNEL16x3_1 + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %zmm2 +.endm + + + + +.macro KERNEL16x3_2 + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %zmm2 +.endm + +.macro 
KERNEL16x3_3 + vmovups 0 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %zmm2 +.endm + +.macro KERNEL16x3_4 + vmovups 16 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + addq $12, BI + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + addq $64, %rax +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %zmm0 + + vmulpd %zmm0 , %zmm4 , %zmm4 + vmulpd %zmm0 , %zmm10, %zmm10 + + vmulpd %zmm0 , %zmm5 , %zmm5 + vmulpd %zmm0 , %zmm11, %zmm11 + + vmulpd %zmm0 , %zmm6 , %zmm6 + vmulpd %zmm0 , %zmm12, %zmm12 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %zmm4,%zmm4 + vaddpd 8 * SIZE(CO1), %zmm10,%zmm10 + + vaddpd (CO1, LDC), %zmm5,%zmm5 + vaddpd 8 * SIZE(CO1, LDC), %zmm11,%zmm11 + + vaddpd (CO1, LDC, 2), %zmm6,%zmm6 + vaddpd 8 * SIZE(CO1, LDC, 2), %zmm12,%zmm12 + +#endif + + vmovups %zmm4 , (CO1) + vmovups %zmm10, 8 * SIZE(CO1) + + vmovups %zmm5 , (CO1, LDC) + vmovups %zmm11, 8 * SIZE(CO1, LDC) + + vmovups %zmm6 , (CO1, LDC, 2) + vmovups %zmm12, 8 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + 
VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 
, %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + 
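+/* The SAVE* macros above all follow one pattern: scale the accumulator
+   registers by ALPHA, add the existing contents of C unless this is a
+   TRMM kernel (TRMMKERNEL defined, where the tile is overwritten rather
+   than accumulated), then store the results back through CO1 / LDC. */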
+.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, 
BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + 
VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + 
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), 
%xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ 
%ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + 
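+/* The 1x1 kernels that follow handle the last odd row of M; each step
+   reduces to a scalar FMA (illustrative C; acc, a, b, bi and ra are
+   hypothetical names for the register state):
+
+       acc += a[ra] * b[bi];   // VFMADD231SD_ on a single double
+       bi += 1; ra += 1;       // addq $1, BI / addq $1, %rax in _SUB
+
+   SAVE1x1 then stores ALPHA * acc, first adding the old C element
+   when TRMMKERNEL is not defined. */
+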
+/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 1, BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + 
vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 8 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + 
testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + 
ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, 
BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, 
BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
+ ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + 
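+/* Row-tail dispatch in this single-column pass: after the 16-wide
+   loop, the low bits of M pick the narrower kernels in turn
+   (illustrative C mirroring the testq/jz ladder around this point;
+   run_Nx1 is a hypothetical stand-in for the labelled blocks):
+
+       if (M & 8) run_8x1();   // .L1_20_1
+       if (M & 4) run_4x1();   // .L1_21
+       if (M & 2) run_2x1();   // .L1_31
+       if (M & 1) run_1x1();   // .L1_41
+*/
+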
+.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE 
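+/* The TRMM variant below reuses the same FMA kernels; what changes is
+   the KK/OFFSET bookkeeping, which clips each tile's inner K loop so
+   that only the triangular part of the packed operand is accumulated,
+   and the SAVE macros overwrite C instead of updating it. Roughly
+   (illustrative C; kkk and kk mirror KKK and KK, m_unroll/n_unroll
+   the tile sizes):
+
+       kkk = ((LEFT && !TRANSA) || (!LEFT && TRANSA))
+                 ? K - kk
+                 : kk + (LEFT ? m_unroll : n_unroll);
+       for (long l = 0; l < kkk; l++) { }  // same KERNELmxn steps
+       // store ALPHA * acc; the vaddpd/vaddsd of C is compiled out
+*/
+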
+ + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 
) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL 
+ movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + 
KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, 
BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq 
$2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + 
ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S index 1fab892ca..ac4421252 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S @@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups -16 * SIZE(AO), %zmm0 vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -3 * SIZE(BO), %zmm3 - prefetcht0 A_PR1(AO) +# prefetcht0 A_PR1(AO) VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) @@ -183,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO), %zmm0 vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -3 * SIZE(BO), %zmm3 - prefetcht0 A_PR1(AO) VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) From ac7b6e3e9aeffe111a0ef23ba74ac2b181b87e30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 08:23:40 +0200 Subject: [PATCH 065/935] Fix misplaced endif --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2c902d108..ac1186c8f 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -302,8 +302,8 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; } +#endif } -#endif //Intel Skylake if (model == 14) { if(support_avx()) From 8be027e4c62460f373980e883c487a30a15b5a5d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 14:36:39 +0200 Subject: [PATCH 066/935] Update dynamic.c --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ac1186c8f..96612cc52 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -82,7 +82,7 @@ extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; #else -#define gotoblas_SKYLAKEX gotoblas_HASWELL; +#define gotoblas_SKYLAKEX gotoblas_HASWELL #endif #endif #else From dc9fe05ab5845452d684746bb7b7b7ad400c0c31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 17:10:19 +0200 Subject: [PATCH 067/935] Update cpuid_x86.c --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index d0dbe1d24..fc937865c 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1883,7 +1883,7 @@ int get_coretype(void){ #ifndef NO_AVX512 return CORE_SKYLAKEX; #else - if/support_avx()) + if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else From b7feded85acaf95d68ed4cfd573e60c83fdbca5d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:24:05 +0200 Subject: [PATCH 068/935] Propagate NO_AVX512 via CCOMMON_OPT --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 82e38a6d2..8c875d6f7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -939,6 +939,10 @@ ifeq ($(NO_AVX2), 1) CCOMMON_OPT += -DNO_AVX2 endif +ifeq ($(NO_AVX512), 1) +CCOMMON_OPT += -DNO_AVX512 +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER From 38ad05bd0484ea723a42415f986cf0db24e01ca8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:26:49 +0200 Subject: [PATCH 069/935] Extend loop range to find SkylakeX in force_coretype --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 96612cc52..acb2d8b8c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -506,7 +506,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 23; i++) + for ( i=1 ; i <= 24; i++) { if (!strncasecmp(coretype,corename[i],20)) { From 354a976a59f1280c5403b8de37587baf53527b67 Mon Sep 17 00:00:00 2001 From: 
Martin Kroeker
Date: Tue, 5 Jun 2018 10:31:34 +0200
Subject: [PATCH 070/935] Fix inverted condition in _Atomic declaration

fixes #1593
---
 common.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/common.h b/common.h
index 123e3dee7..980099ee3 100644
--- a/common.h
+++ b/common.h
@@ -642,6 +642,7 @@ void gotoblas_profile_init(void);
 void gotoblas_profile_quit(void);

 #ifdef USE_OPENMP
+
 #ifndef C_MSVC
 int omp_in_parallel(void);
 int omp_get_num_procs(void);
@@ -649,12 +650,15 @@ int omp_get_num_procs(void);
 __declspec(dllimport) int __cdecl omp_in_parallel(void);
 __declspec(dllimport) int __cdecl omp_get_num_procs(void);
 #endif
+
 #if (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#else
 #ifndef _Atomic
 #define _Atomic volatile
 #endif
-#include <stdatomic.h>
 #endif
+
 #else
 #ifdef __ELF__
 int omp_in_parallel (void) __attribute__ ((weak));

From 15a78d6b662569a464de9a00517897b036fe7886 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 5 Jun 2018 15:58:34 +0200
Subject: [PATCH 071/935] export NO_AVX512 setting

---
 Makefile.system | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile.system b/Makefile.system
index 8c875d6f7..eaf3e9889 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1249,6 +1249,7 @@ export MSA_FLAGS
 export KERNELDIR
 export FUNCTION_PROFILE
 export TARGET_CORE
+export NO_AVX512

 export SGEMM_UNROLL_M
 export SGEMM_UNROLL_N

From e8002536ec90b74148abce1c3de9bca0061dbe32 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 5 Jun 2018 18:23:01 +0200
Subject: [PATCH 072/935] disable quiet_make for the moment

---
 Makefile.system | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index eaf3e9889..5c16e2bee 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1)
 GETARCH_FLAGS += -g
 endif

-ifeq ($(QUIET_MAKE), 1)
-MAKE += -s
-endif
+#ifeq ($(QUIET_MAKE), 1)
+#MAKE += -s
+#endif

 ifndef NO_PARALLEL_MAKE
 NO_PARALLEL_MAKE=0

From f6021c798dea23685af3eedcb63c4a388c78f226 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 5 Jun 2018 19:09:38 +0200
Subject: [PATCH 073/935] Re-enable QUIET_MAKE

---
 Makefile.system | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 5c16e2bee..eaf3e9889 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1)
 GETARCH_FLAGS += -g
 endif

-#ifeq ($(QUIET_MAKE), 1)
-#MAKE += -s
-#endif
+ifeq ($(QUIET_MAKE), 1)
+MAKE += -s
+endif

 ifndef NO_PARALLEL_MAKE
 NO_PARALLEL_MAKE=0

From 7fb62aed7e2a08fb8fc62054a164d3479511ce82 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 5 Jun 2018 23:29:33 +0200
Subject: [PATCH 074/935] Check build system support for AVX512 instructions

---
 cmake/system_check.cmake | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index d47c38cdd..f054852bf 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -66,3 +66,12 @@ else()
 set(BINARY32 1)
 endif()

+if (X86_64 OR X86)
+  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "void main(void){ __asm__ volatile(\"vaddps %zmm1, %zmm0, %zmm0\"); }")
+execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
+if (NO_AVX512 EQUAL 1)
+set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
+endif()
+  file(REMOVE "avx512.tmp" "avx512.o")
+endif()
+

From 06d43760e4ca2cc7007e54d88938eff9e95e0579 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 6 Jun 2018 09:18:10 +0200
Subject: [PATCH 075/935] Restore _Atomic define before stdatomic.h for old gcc

see #1593
---
 common.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/common.h b/common.h
index 123e3dee7..ecf07316d 100644
--- a/common.h
+++ b/common.h
@@ -649,12 +649,21 @@ int omp_get_num_procs(void);
 __declspec(dllimport) int __cdecl omp_in_parallel(void);
 __declspec(dllimport) int __cdecl omp_get_num_procs(void);
 #endif
+
 #if (__STDC_VERSION__ >= 201112L)
+#if defined(C_GCC) && ( __GNUC__ < 7)
+// workaround for GCC bug 65467
 #ifndef _Atomic
 #define _Atomic volatile
 #endif
+#endif
 #include <stdatomic.h>
+#else
+#ifndef _Atomic
+#define _Atomic volatile
 #endif
+
+
 #else
 #ifdef __ELF__
 int omp_in_parallel (void) __attribute__ ((weak));

From 83da278093e32f1e089a12d880c7ec65dfbb1457 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 6 Jun 2018 09:27:49 +0200
Subject: [PATCH 076/935] Update common.h

---
 common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common.h b/common.h
index cd1c4c0d1..663f37e7b 100644
--- a/common.h
+++ b/common.h
@@ -663,6 +663,7 @@ __declspec(dllimport) int __cdecl omp_get_num_procs(void);
 #ifndef _Atomic
 #define _Atomic volatile
 #endif
+#endif

 #else
 #ifdef __ELF__

From 9b87b642624b398ebacee525edbc879cf3f950ea Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 6 Jun 2018 16:49:00 +0200
Subject: [PATCH 077/935] Improve AVX512 testcase

clang 3.4 managed to accept the original test code, only to fail on the
actual Skylake asm later
---
 c_check | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/c_check b/c_check
index dfe99350a..cc64c16c6 100644
--- a/c_check
+++ b/c_check
@@ -203,8 +203,8 @@ $binformat = bin64 if ($data =~ /BINARY_64/);

 $no_avx512= 0;
 if (($architecture eq "x86") || ($architecture eq "x86_64")) {
-    $code = '"vaddps %zmm1, %zmm0, %zmm0"';
-    print $tmpf "void main(void){ __asm__ volatile($code); }\n";
+    $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
+    print $tmpf "int main(void){ __asm__ volatile($code); }\n";
     $args = " -o $tmpf.o -x c $tmpf";
     my @cmd = ("$compiler_name $args");
     system(@cmd) == 0;

From e4718b1fee0f8dcd0c892063d619477bd5ed31ce Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 6 Jun 2018 16:51:30 +0200
Subject: [PATCH 078/935] Better AVX512 test case

---
 cmake/system_check.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index f054852bf..a565fc0d5 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -67,7 +67,7 @@ else()
 endif()

 if (X86_64 OR X86)
-  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "void main(void){ __asm__ volatile(\"vaddps %zmm1, %zmm0, %zmm0\"); }")
+  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
 execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
 if (NO_AVX512 EQUAL 1)
 set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")

From ed7c4a043b3093dfe8ddb3d6d3e3d6fd6af43d4a Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 7 Jun 2018 10:18:26 +0200
Subject: [PATCH 079/935] Use usleep instead of sched_yield by default

sched_yield only burns cpu cycles, fixes #900, see also #923, #1560
---
 common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common.h b/common.h
index 663f37e7b..b7181e670 100644
--- a/common.h
+++ b/common.h
@@ -356,7 +356,7 @@ typedef int blasint;
 */

 #ifndef YIELDING
-#define YIELDING 
sched_yield() +#define YIELDING usleep(10) #endif /*** From e8880c1699816483090aa5574cf9b3322943831f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 Jun 2018 10:26:55 +0200 Subject: [PATCH 080/935] Use a single thread for small input size copies daxpy improvement from #27, see #1560 --- interface/zaxpy.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/interface/zaxpy.c b/interface/zaxpy.c index fbb830ffb..529e78e79 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -41,7 +41,11 @@ #ifdef FUNCTION_PROFILE #include "functable.h" #endif - +#if defined(Z13) +#define MULTI_THREAD_MINIMAL 200000 +#else +#define MULTI_THREAD_MINIMAL 10000 +#endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ @@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in #endif #ifndef CBLAS - PRINT_DEBUG_CNAME; + PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif @@ -93,6 +97,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if (incx == 0 || incy == 0) nthreads = 1; + //Work around the low performance issue with small imput size & + //multithreads. + if (n <= MULTI_THREAD_MINIMAL) { + nthreads = 1; + } if (nthreads == 1) { #endif From 66316b9f4c8c7c48eed8b29e86f64581c02d45b0 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 7 Jun 2018 14:54:42 +0100 Subject: [PATCH 081/935] Improve performance of GEMM for small matrices when SMP is defined. Always checking num_cpu_avail() regardless of whether threading will actually be used adds noticeable overhead for small matrices. Most other uses of num_cpu_avail() do so only if threading will be used, so do the same here. --- interface/gemm.c | 27 ++++++--------------------- interface/trsm.c | 3 ++- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 8baf3fbec..a3bac5984 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -44,6 +44,7 @@ #endif #ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) @@ -52,6 +53,7 @@ #define ERROR_NAME "SGEMM " #endif #else +#define SMP_THRESHOLD_MIN 8192.0 #ifndef GEMM3M #ifdef XDOUBLE #define ERROR_NAME "XGEMM " @@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB, FLOAT *sa, *sb; #ifdef SMP - int nthreads_max; - int nthreads_avail; double MNK; #ifndef COMPLEX #ifdef XDOUBLE @@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *sa, *sb; #ifdef SMP - int nthreads_max; - int nthreads_avail; double MNK; #ifndef COMPLEX #ifdef XDOUBLE @@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); - nthreads_max = num_cpu_avail(3); - nthreads_avail = nthreads_max; - -#ifndef COMPLEX MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - nthreads_max = 1; -#else - MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - nthreads_max = 1; -#endif - args.common = NULL; - - if ( nthreads_max > nthreads_avail ) - args.nthreads = nthreads_avail; + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) + args.nthreads = 1; else - args.nthreads = nthreads_max; - + args.nthreads = num_cpu_avail(3); + args.common = NULL; if 
(args.nthreads == 1) { #endif diff --git a/interface/trsm.c b/interface/trsm.c index 60c49795d..5c2750e79 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - args.nthreads = num_cpu_avail(3); if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; + else + args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { From 6c2d90ba7724b05e7fb97c7ec33324499e4a1a79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:29:17 +0200 Subject: [PATCH 082/935] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- CMakeLists.txt | 1 + Makefile | 3 +++ Makefile.install | 2 +- Makefile.rule | 5 +++++ Makefile.system | 17 ++++++++++++++++- 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f49f20513..66c3d8afa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON endif() option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) +option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) ####### if(BUILD_WITHOUT_LAPACK) diff --git a/Makefile b/Makefile index 380ba1ce8..56b4426f8 100644 --- a/Makefile +++ b/Makefile @@ -153,6 +153,9 @@ ifeq ($(DYNAMIC_ARCH), 1) do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ done @echo DYNAMIC_ARCH=1 >> Makefile.conf_last +ifeq ($(DYNAMIC_OLDER), 1) + @echo DYNAMIC_OLDER=1 >> Makefile.conf_last +endif endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last diff --git a/Makefile.install b/Makefile.install index 21c3c9e22..c51c8a021 100644 --- a/Makefile.install +++ b/Makefile.install @@ -98,7 +98,7 @@ endif @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" diff --git a/Makefile.rule b/Makefile.rule index 1b4b8eb63..5c03d0195 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -17,6 +17,11 @@ VERSION = 0.3.1.dev # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 +# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH +# mode (including individual optimizied codes for PENRYN, DUNNINGTON, OPTERON, +# OPTERON_SSE3, ATOM and NANO 
rather than fallbacks to older architectures) +# DYNAMIC_OLDER = 1 + # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. # CC = gcc diff --git a/Makefile.system b/Makefile.system index eaf3e9889..62ba0e466 100644 --- a/Makefile.system +++ b/Makefile.system @@ -472,7 +472,18 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += PENRYN DUNNINGTON +endif +DYNAMIC_CORE += NEHALEM +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += OPTERON OPTERON_SSE3 +endif +DYNAMIC_CORE += BARCELONA +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += BOBCAT ATOM NANO +endif ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif @@ -917,6 +928,10 @@ ifeq ($(DYNAMIC_ARCH), 1) CCOMMON_OPT += -DDYNAMIC_ARCH endif +ifeq ($(DYNAMIC_OLDER), 1) +CCOMMON_OPT += -DDYNAMIC_OLDER +endif + ifeq ($(NO_LAPACK), 1) CCOMMON_OPT += -DNO_LAPACK #Disable LAPACK C interface From 1cbd8f3ae47ffb89523fa247e81ffea07c6505a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:30:46 +0200 Subject: [PATCH 083/935] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- cmake/arch.cmake | 13 ++++++++++++- cmake/openblas.pc.in | 2 +- cmake/system.cmake | 3 +++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 527d2bec6..52fb64eaa 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -49,7 +49,18 @@ if (DYNAMIC_ARCH) endif () if (X86_64) - set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) + set(DYNAMIC_CORE PRESCOTT CORE2) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON) + endif () + set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3) + endif () + set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO) + endif () if (NOT NO_AVX) set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR) endif () diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 35973b09b..ca88a6d5f 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,7 +1,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ -openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ +openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ diff --git a/cmake/system.cmake b/cmake/system.cmake index c21fe7c14..48e8f75bc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -163,6 +163,9 @@ endif () if (DYNAMIC_ARCH) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") + if (DYNAMIC_OLDER) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + endif () endif () if (NO_LAPACK) From 
63f7395fb49091295463785f6c1056f61dd64a7d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:31:38 +0200 Subject: [PATCH 084/935] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- driver/others/dynamic.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index acb2d8b8c..4271c0a0d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -56,16 +56,27 @@ EXTERN gotoblas_t gotoblas_BANIAS; EXTERN gotoblas_t gotoblas_ATHLON; extern gotoblas_t gotoblas_PRESCOTT; +extern gotoblas_t gotoblas_CORE2; +extern gotoblas_t gotoblas_NEHALEM; +extern gotoblas_t gotoblas_BARCELONA; +#ifdef DYNAMIC_OLDER extern gotoblas_t gotoblas_ATOM; extern gotoblas_t gotoblas_NANO; -extern gotoblas_t gotoblas_CORE2; extern gotoblas_t gotoblas_PENRYN; extern gotoblas_t gotoblas_DUNNINGTON; -extern gotoblas_t gotoblas_NEHALEM; extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON_SSE3; -extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_ATOM gotoblas_NEHALEM +#define gotoblas_NANO gotoblas_NEHALEM +#define gotoblas_PENRYN gotoblas_CORE2 +#define gotoblas_DUNNINGTON gotoblas_CORE2 +#define gotoblas_OPTERON gotoblas_CORE2 +#define gotoblas_OPTERON_SSE3 gotoblas_CORE2 +#define gotoblas_BOBCAT gotoblas_CORE2 +#endif + #ifndef NO_AVX extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; From e9cd11768c20707eff31912db1bafc837c0224d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 17:54:36 +0200 Subject: [PATCH 085/935] Enable parallel make on MS Windows by default fixes #874 --- getarch.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/getarch.c b/getarch.c index fcffe63e2..31f41d62c 100644 --- a/getarch.c +++ b/getarch.c @@ -1196,9 +1196,7 @@ int main(int argc, char *argv[]){ #elif NO_PARALLEL_MAKE==1 printf("MAKE += -j 1\n"); #else -#ifndef OS_WINDOWS printf("MAKE += -j %d\n", get_num_cores()); -#endif #endif break; From 0bea6bb9e7e2468bc9d42f5ffdf27f772f2984af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 09:24:37 +0200 Subject: [PATCH 086/935] Create OpenBLASConfig.cmake from cmake as well --- CMakeLists.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f49f20513..e1c308910 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON endif() option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) +option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) ####### if(BUILD_WITHOUT_LAPACK) @@ -208,6 +209,7 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} + EXPORT "OpenBLASTargets" RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) @@ -267,3 +269,21 @@ if(PKG_CONFIG_FOUND) configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() + + +# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". 
+set(PN OpenBLAS) +set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") +configure_package_config_file(cmake/${PN}Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" + INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + VERSION ${${PN}_VERSION} + COMPATIBILITY AnyNewerVersion) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +install(EXPORT "${PN}Targets" + NAMESPACE "${PN}::" + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) + From 02634b549b678dc38c85ce4c77ebb532e8d9e471 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 09:25:46 +0200 Subject: [PATCH 087/935] Add template for OpenBLASConfig.cmake --- cmake/OpenBLASConfig.cmake.in | 79 +++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 cmake/OpenBLASConfig.cmake.in diff --git a/cmake/OpenBLASConfig.cmake.in b/cmake/OpenBLASConfig.cmake.in new file mode 100644 index 000000000..87a1621b4 --- /dev/null +++ b/cmake/OpenBLASConfig.cmake.in @@ -0,0 +1,79 @@ +# OpenBLASConfig.cmake +# -------------------- +# +# OpenBLAS cmake module. +# This module sets the following variables in your project:: +# +# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system +# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release +# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located. +# OpenBLAS_INCLUDE_DIR - same as DIRS +# OpenBLAS_LIBRARIES - OpenBLAS library to link against. +# OpenBLAS_LIBRARY - same as LIBRARIES +# +# +# Available components:: +# +## shared - search for only shared library +## static - search for only static library +# serial - search for unthreaded library +# pthread - search for native pthread threaded library +# openmp - search for OpenMP threaded library +# +# +# Exported targets:: +# +# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED` +## target. Target is shared _or_ static, so, for both, use separate, not +## overlapping, installations. :: +# +# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached. 
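+#
+# For example, a consuming project could link the imported target like any
+# other (a hypothetical sketch, not part of the shipped template; the `demo`
+# target and `main.c` are placeholders)::
+#
+#   find_package(OpenBLAS CONFIG REQUIRED)
+#   add_executable(demo main.c)
+#   target_link_libraries(demo OpenBLAS::OpenBLAS)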
+# +# +# Suggested usage:: +# +# find_package(OpenBLAS) +# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread) +# +# +# The following variables can be set to guide the search for this package:: +# +# OpenBLAS_DIR - CMake variable, set to directory containing this Config file +# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package +# PATH - environment variable, set to bin directory of this package +# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables +# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build + +@PACKAGE_INIT@ + +set(PN OpenBLAS) + +# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon +if(@USE_OPENMP@) + set(${PN}_openmp_FOUND 1) +elseif(@USE_THREAD@) + set(${PN}_pthread_FOUND 1) +else() + set(${PN}_serial_FOUND 1) +endif() + +check_required_components(${PN}) + +#----------------------------------------------------------------------------- +# Don't include targets if this file is being picked up by another +# project which has already built this as a subproject +#----------------------------------------------------------------------------- +if(NOT TARGET ${PN}::OpenBLAS) + include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake") + + get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION) + set(${PN}_LIBRARY ${_loc}) + get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES) + set(${PN}_LIBRARIES ${_ill}) + + get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES) + set(${PN}_INCLUDE_DIR ${_id}) + get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + set(${PN}_INCLUDE_DIRS ${_iid}) +endif() + From e65f451409e2150bf299a2cdd906bec4ffff7915 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 15:09:43 +0200 Subject: [PATCH 088/935] include CMakePackageConfigHelpers --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e1c308910..a2421ac54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,9 @@ set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${Open # Adhere to GNU filesystem layout conventions include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + + set(OpenBLAS_LIBNAME openblas) ####### From c2545b0fd6978e1fb09c2dc86b825846e0034228 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Mon, 11 Jun 2018 10:13:09 +0100 Subject: [PATCH 089/935] Fixed a few more unnecessary calls to num_cpu_avail. I don't have as many benchmarks for these as for gemm, but it should still make a difference for small matrices. 
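Every file below gets the same shape of change (a minimal sketch, not the
literal OpenBLAS code; the 10000-element cutoff stands in for the various
per-kernel thresholds):

```
/* Resolve the cheap single-threaded cases first, so that num_cpu_avail(),
   which is comparatively expensive, is only consulted when threading can
   actually pay off. */
static int choose_nthreads(long n, long incx, long incy)
{
    if (incx == 0 || incy == 0 || n <= 10000)
        return 1;               /* degenerate strides or small n: stay serial */
    return num_cpu_avail(1);    /* queried only on the multithreaded path */
}
```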
--- interface/axpy.c | 14 ++++++-------- interface/scal.c | 5 +++-- interface/zaxpy.c | 14 ++++++-------- interface/zscal.c | 4 ++-- interface/zswap.c | 4 ++-- kernel/arm64/casum_thunderx2t99.c | 9 +++------ kernel/arm64/copy_thunderx2t99.c | 9 +++------ kernel/arm64/dasum_thunderx2t99.c | 9 +++------ kernel/arm64/dot_thunderx2t99.c | 11 ++++------- kernel/arm64/dznrm2_thunderx2t99.c | 4 ++-- kernel/arm64/dznrm2_thunderx2t99_fast.c | 4 ++-- kernel/arm64/iamax_thunderx2t99.c | 9 +++------ kernel/arm64/izamax_thunderx2t99.c | 9 +++------ kernel/arm64/sasum_thunderx2t99.c | 9 +++------ kernel/arm64/scnrm2_thunderx2t99.c | 4 ++-- kernel/arm64/zasum_thunderx2t99.c | 9 +++------ kernel/arm64/zdot_thunderx2t99.c | 9 +++------ kernel/x86_64/ddot.c | 15 ++++++--------- 18 files changed, 59 insertions(+), 92 deletions(-) diff --git a/interface/axpy.c b/interface/axpy.c index f0d95b395..39edea6af 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -40,11 +40,11 @@ #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" -#endif +#endif #if defined(Z13) #define MULTI_THREAD_MINIMAL 200000 #else -#define MULTI_THREAD_MINIMAL 10000 +#define MULTI_THREAD_MINIMAL 10000 #endif #ifndef CBLAS @@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc if (incy < 0) y -= (n - 1) * incy; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) - nthreads = 1; - + // //Temporarily work-around the low performance issue with small imput size & //multithreads. - if (n <= MULTI_THREAD_MINIMAL) + if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/scal.c b/interface/scal.c index 3f468a2a3..6d07b1650 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #ifdef SMP - nthreads = num_cpu_avail(1); - if (n <= 1048576 ) nthreads = 1; + else + nthreads = num_cpu_avail(1); + if (nthreads == 1) { #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index 529e78e79..1a0259c96 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -90,18 +90,16 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) - nthreads = 1; - - //Work around the low performance issue with small imput size & + // + //Temporarily work-around the low performance issue with small imput size & //multithreads. 
- if (n <= MULTI_THREAD_MINIMAL) { + if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; - } + else + nthreads = num_cpu_avail(1); + if (nthreads == 1) { #endif diff --git a/interface/zscal.c b/interface/zscal.c index 633b6ecf5..bfaddc260 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ FUNCTION_PROFILE_START(); #ifdef SMP - nthreads = num_cpu_avail(1); - if ( n <= 1048576 ) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/zswap.c b/interface/zswap.c index 5308cbe90..e33bbafba 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy; if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/kernel/arm64/casum_thunderx2t99.c b/kernel/arm64/casum_thunderx2t99.c index cd5d936c5..c6dbb3f77 100644 --- a/kernel/arm64/casum_thunderx2t99.c +++ b/kernel/arm64/casum_thunderx2t99.c @@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = casum_compute(n, x, inc_x); diff --git a/kernel/arm64/copy_thunderx2t99.c b/kernel/arm64/copy_thunderx2t99.c index bd67b48b0..e31876139 100644 --- a/kernel/arm64/copy_thunderx2t99.c +++ b/kernel/arm64/copy_thunderx2t99.c @@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if (n <= 0) return 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { do_copy(n, x, inc_x, y, inc_y); diff --git a/kernel/arm64/dasum_thunderx2t99.c b/kernel/arm64/dasum_thunderx2t99.c index ba12fc776..a212c9534 100644 --- a/kernel/arm64/dasum_thunderx2t99.c +++ b/kernel/arm64/dasum_thunderx2t99.c @@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = dasum_compute(n, x, inc_x); diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c index 8eeb94f36..3940acddd 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_thunderx2t99.c @@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" faddp "DOTF", v0.2d \n" #endif /* !defined(DSDOT) */ -#else /* !defined(DOUBLE) */ +#else /* !defined(DOUBLE) */ #define KERNEL_F1 \ " ldr "TMPX", ["X"] \n" \ " ldr "TMPY", ["Y"] \n" \ @@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y RETURN_TYPE dot = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index 2aea9b4a9..b94f0cffc 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2_compute(n, x, inc_x, &ssq, &scale); diff --git a/kernel/arm64/dznrm2_thunderx2t99_fast.c b/kernel/arm64/dznrm2_thunderx2t99_fast.c index 8b04a3eb6..8405b388b 100644 --- a/kernel/arm64/dznrm2_thunderx2t99_fast.c +++ b/kernel/arm64/dznrm2_thunderx2t99_fast.c @@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2 = nrm2_compute(n, x, inc_x); diff --git a/kernel/arm64/iamax_thunderx2t99.c b/kernel/arm64/iamax_thunderx2t99.c index a11b18419..e3bec4a20 100644 --- a/kernel/arm64/iamax_thunderx2t99.c +++ b/kernel/arm64/iamax_thunderx2t99.c @@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max_index = 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { max_index = iamax_compute(n, x, inc_x); diff --git a/kernel/arm64/izamax_thunderx2t99.c b/kernel/arm64/izamax_thunderx2t99.c index 8d70b0515..b2e2828f0 100644 --- a/kernel/arm64/izamax_thunderx2t99.c +++ b/kernel/arm64/izamax_thunderx2t99.c @@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max_index = 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { max_index = izamax_compute(n, x, inc_x); diff --git a/kernel/arm64/sasum_thunderx2t99.c b/kernel/arm64/sasum_thunderx2t99.c index 28fc34c62..014c667ba 100644 --- a/kernel/arm64/sasum_thunderx2t99.c +++ b/kernel/arm64/sasum_thunderx2t99.c @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = sasum_compute(n, x, inc_x); diff --git a/kernel/arm64/scnrm2_thunderx2t99.c b/kernel/arm64/scnrm2_thunderx2t99.c index b8df4962b..f96de441e 100644 --- a/kernel/arm64/scnrm2_thunderx2t99.c +++ b/kernel/arm64/scnrm2_thunderx2t99.c @@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = 
num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2_double = nrm2_compute(n, x, inc_x); diff --git a/kernel/arm64/zasum_thunderx2t99.c b/kernel/arm64/zasum_thunderx2t99.c index 140e5a741..1d303a9a3 100644 --- a/kernel/arm64/zasum_thunderx2t99.c +++ b/kernel/arm64/zasum_thunderx2t99.c @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = zasum_compute(n, x, inc_x); diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 70d683077..6185bc7d9 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA CIMAG(zdot) = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { zdot_compute(n, x, inc_x, y, inc_y, &zdot); diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 059549028..0dc9cd3da 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "ddot_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "ddot_microk_piledriver-2.c" -#elif defined(NEHALEM) +#elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ddot_microk_haswell-2.c" @@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; - BLASLONG n1 = n & -4; + BLASLONG n1 = n & -4; while(i < n1) { @@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT dot = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); From 6f71c0fce45c86c55d12b6e12e69b9ccb8ec2f28 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Jun 2018 13:26:19 +0200 Subject: [PATCH 090/935] =?UTF-8?q?Return=20a=20somewhat=20sane=20default?= =?UTF-8?q?=20value=20for=20L2=20cache=20size=20if=20cpuid=20retur?= =?UTF-8?q?=E2=80=A6=20(#1611)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Return a somewhat sane default value for L2 cache size if cpuid returned something unexpected Fixes #1610, the KVM hypervisor on Google Chromebooks returning zero for CPUID 0x80000006, causing DYNAMIC_ARCH builds of OpenBLAS to hang --- kernel/setparam-ref.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 9030d7c6d..f654de110 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -647,7 +647,9 @@ static int get_l2_size_old(void){ return 6144; } } - return 0; +// return 0; +fprintf (stderr,"OpenBLAS WARNING - could 
not determine the L2 cache size on this system, assuming 256k\n"); +return 256; } #endif @@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){ l2 = BITMASK(ecx, 16, 0xffff); #ifndef ARCH_X86 + if (l2 <= 0) { + fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n"); + return 256; + } return l2; #else From de8fff671d6081bf543b55c95655fe5f6b5e4007 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Jun 2018 17:05:27 +0200 Subject: [PATCH 091/935] Revert "Use usleep instead of sched_yield by default" --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index b7181e670..663f37e7b 100644 --- a/common.h +++ b/common.h @@ -356,7 +356,7 @@ typedef int blasint; */ #ifndef YIELDING -#define YIELDING usleep(10) +#define YIELDING sched_yield() #endif /*** From fcb77ab129821690fac4e532640c5cfa786c3a79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Jun 2018 16:57:58 +0200 Subject: [PATCH 092/935] Update OSX deployment target to 10.8 fixes #1580 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 62ba0e466..5dffd8d2e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -248,7 +248,7 @@ endif ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET -export MACOSX_DEPLOYMENT_TARGET=10.6 +export MACOSX_DEPLOYMENT_TARGET=10.8 endif MD5SUM = md5 -r endif From bf40f806efa55c7a7c7ec57535919598eaeb569d Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 14 Jun 2018 12:18:04 +0100 Subject: [PATCH 093/935] Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. 
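The key to the change is that each thread keeps its own table of buffers in thread-local storage, so the fast path never has to take a lock at all. A minimal sketch of the pattern (illustrative only, not the actual memory.c code: the slot count, the malloc backing and the single fixed buffer size are assumptions; the real allocator uses mmap/shm/hugetlb backends with per-buffer release functions):

```c
#include <stdlib.h>

#define SLOTS 64                  /* assumed per-thread buffer count */

struct slot { void *addr; int used; };

/* one table per thread - scanning and marking slots needs no lock,
   because no other thread can ever touch this table */
static __thread struct slot table[SLOTS];

void *buffer_alloc(size_t size) { /* assumes all buffers share one size */
  for (int i = 0; i < SLOTS; i++) {
    if (!table[i].used) {
      if (!table[i].addr)         /* allocate lazily, then reuse forever */
        table[i].addr = malloc(size);
      table[i].used = 1;
      return table[i].addr;
    }
  }
  return NULL;                    /* table exhausted */
}

void buffer_free(void *p) {
  for (int i = 0; i < SLOTS; i++)
    if (table[i].addr == p) { table[i].used = 0; return; }
}
```

With one table per thread there is nothing to contend on, which is why the locking overhead in the numbers below disappears.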
Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 
BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something. --- driver/others/memory.c | 199 +++++++++-------------------------------- 1 file changed, 43 insertions(+), 156 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d69e52e97..85f790615 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FIXED_PAGESIZE 4096 #endif +#ifndef BUFFERS_PER_THREAD +#ifdef USE_OPENMP +#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#else +#define BUFFERS_PER_THREAD NUM_BUFFERS +#endif +#endif + #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -213,7 +221,7 @@ int i,n; ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -415,8 +423,15 @@ struct release_t { int hugetlb_allocated = 0; -static struct release_t release_info[NUM_BUFFERS]; -static int release_pos = 0; +#if defined(OS_WINDOWS) +#define THREAD_LOCAL __declspec(thread) +#define UNLIKELY_TO_BE_ZERO(x) (x) +#else +#define THREAD_LOCAL __thread +#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#endif +static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; +static int THREAD_LOCAL release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } #ifdef OS_LINUX @@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } return map_address; @@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; @@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -static volatile struct { - BLASULONG lock; +struct memory_t { void *addr; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int pos; -#endif int used; #ifndef __64BIT__ char dummy[48]; #else char dummy[40]; #endif +}; -} memory[NUM_BUFFERS]; +static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; static int memory_initialized = 0; @@ -987,9 +987,6 @@ static int memory_initialized = 0; void *blas_memory_alloc(int procpos){ int position; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; -#endif void *map_address; @@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){ }; void *(**func)(void *address); -#if defined(USE_OPENMP) - if (!memory_initialized) { -#endif - - LOCK_COMMAND(&alloc_lock); + if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - if (!memory_initialized) { + /* Only allow a single thread to initialize memory system */ + LOCK_COMMAND(&alloc_lock); -#if defined(WHEREAMI) && !defined(USE_OPENMP) - for (position = 0; position < NUM_BUFFERS; position ++){ - memory[position].addr = (void *)0; - memory[position].pos = -1; - memory[position].used = 0; - memory[position].lock = 0; - } -#endif + if (!memory_initialized) { #ifdef DYNAMIC_ARCH - gotoblas_dynamic_init(); + gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && 
!defined(NO_AFFINITY) - gotoblas_affinity_init(); + gotoblas_affinity_init(); #endif #ifdef SMP - if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #ifndef DYNAMIC_ARCH - blas_set_parameter(); + blas_set_parameter(); #endif #endif - memory_initialized = 1; + memory_initialized = 1; + } + UNLOCK_COMMAND(&alloc_lock); } - UNLOCK_COMMAND(&alloc_lock); -#if defined(USE_OPENMP) - } -#endif #ifdef DEBUG printf("Alloc Start ...\n"); -#endif - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - mypos = WhereAmI(); - - position = mypos; - while (position >= NUM_BUFFERS) position >>= 1; - - do { - if (!memory[position].used && (memory[position].pos == mypos)) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - blas_lock(&memory[position].lock); -#endif - if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif - } - - position ++; - - } while (position < NUM_BUFFERS); - - #endif position = 0; do { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - if (!memory[position].used) { - blas_lock(&memory[position].lock); -#endif if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); - } -#endif - position ++; - } while (position < NUM_BUFFERS); + } while (position < BUFFERS_PER_THREAD); goto error; @@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif if (!memory[position].addr) { do { @@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); } #endif #ifdef ALLOC_HUGETLBFILE if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n"); #endif } #endif @@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf(" Mapping Succeeded. 
%p(%d)\n", (void *)memory[position].addr, position); #endif } -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - if (memory[position].pos == -1) memory[position].pos = mypos; - -#endif - -#ifdef DYNAMIC_ARCH - - if (memory_initialized == 1) { - - LOCK_COMMAND(&alloc_lock); - - if (memory_initialized == 1) { - - if (!gotoblas) gotoblas_dynamic_init(); - - memory_initialized = 2; - } - - UNLOCK_COMMAND(&alloc_lock); - - } -#endif - - #ifdef DEBUG printf("Mapped : %p %3d\n\n", (void *)memory[position].addr, position); @@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); return NULL; } @@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) + while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) position++; if (memory[position].addr != free_area) goto error; @@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif - // arm: ensure all writes are finished before other thread takes this memory - WMB; - memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){ printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); #ifdef DEBUG - for (position = 0; position < NUM_BUFFERS; position++) + for (position = 0; position < BUFFERS_PER_THREAD; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); -#endif -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); #endif return; } @@ -1293,8 +1188,6 @@ void blas_shutdown(void){ BLASFUNC(blas_thread_shutdown)(); #endif - LOCK_COMMAND(&alloc_lock); - for (pos = 0; pos < release_pos; pos ++) { release_info[pos].func(&release_info[pos]); } @@ -1305,17 +1198,11 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < NUM_BUFFERS; pos ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ memory[pos].addr = (void *)0; memory[pos].used = 0; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - memory[pos].pos = -1; -#endif - memory[pos].lock = 0; } - UNLOCK_COMMAND(&alloc_lock); - return; } From 47bf0dba8f7a9cbd559e2f9cabe0bf2c7d3ee7a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Jun 2018 11:25:05 +0200 Subject: [PATCH 094/935] Add build-time option for OMP scheduler; document MULTITHREAD_THRESHOLD range (#1620) * Allow choosing the OpenMP scheduler and add range hint for GEMM_MULTITHREAD_THRESHOLD * Amended description of GEMM_MULTITHREAD_THRESHOLD to reflect #742 making it track floating point operations rather than matrix size --- Makefile.rule | 15 +++++++++++++-- driver/others/blas_server_omp.c | 6 +++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 5c03d0195..649aabe70 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev # This flag is always set for POWER8. 
Don't modify the flag # USE_OPENMP = 1 +# The OpenMP scheduler to use - by default this is "static" and you +# will normally not want to change this unless you know that your main +# workload will involve tasks that have highly unbalanced running times +# for individual threads. Changing away from "static" may also adversely +# affect memory access locality in NUMA systems. Setting to "runtime" will +# allow you to select the scheduler from the environment variable OMP_SCHEDULE +# CCOMMON_OPT += -DOMP_SCHED=dynamic + # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. @@ -156,8 +164,11 @@ NO_AFFINITY = 1 # CONSISTENT_FPCSR = 1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute -# with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. +# with single thread. (Actually in recent versions this is a factor proportional to the +# number of floating point operations necessary for the given problem size, no longer +# an individual dimension). You can use this setting to avoid the overhead of multi- +# threading in small matrix sizes. The default value is 4, but values as high as 50 have +# been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 # If you need santy check by comparing reference BLAS. It'll be very diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index fccdb4320..4255852c8 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -48,6 +48,10 @@ #else +#ifndef OMP_SCHED +#define OMP_SCHED static +#endif + int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 From 9e162146a93a58a06515bc53f07e37b8924e0d67 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:32:03 +0000 Subject: [PATCH 095/935] Only initialize the part of the jobs array that will get used The jobs array is getting initialized in O(compiled cpus^2) complexity. Distros and people with bigger systems will use pretty high values (128 or 256 or more) for this value, leading to interesting bubbles in performance. Baseline (single threaded performance) gets roughly 13 - 15 multiplications per cycle in the interesting range (threading kicks in at 65x65 mult by 65x65). The hardware is capable of 32 multiplications per cycle theoretically. 
Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10703.9 10.6 0.0% 17990.6 6.3 0.0% 64 x 64 20778.4 12.8 0.0% 40629.2 6.5 0.0% 65 x 65 26869.9 10.3 0.0% 52545.7 5.3 0.0% 80 x 80 38104.5 13.5 0.0% 72492.7 7.1 0.0% 96 x 96 61626.4 14.4 0.0% 113983.8 7.8 0.0% 112 x 112 91803.8 15.3 0.0% 180987.3 7.8 0.0% 128 x 128 133161.4 15.8 0.0% 258374.3 8.1 0.0% When threading is turned on TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128 Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10725.9 10.5 -0.2% 18134.9 6.2 -0.8% 64 x 64 20500.6 12.9 1.3% 40929.1 6.5 -0.7% 65 x 65 2040832.1 0.1 -7495.2% 2097633.6 0.1 -3892.0% 80 x 80 2063129.1 0.2 -5314.4% 2119925.2 0.2 -2824.3% 96 x 96 2070374.5 0.4 -3259.6% 2173604.4 0.4 -1806.9% 112 x 112 2111721.5 0.7 -2169.6% 2263330.8 0.6 -1170.0% 128 x 128 2276181.5 0.9 -1609.3% 2377228.9 0.9 -820.1% There is a deep deep cliff once you hit 65x65 With this patch Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10630.0 10.6 0.7% 18112.8 6.2 -0.7% 64 x 64 20374.8 13.0 1.9% 40487.0 6.5 0.4% 65 x 65 141955.2 1.9 -428.3% 146708.8 1.9 -179.2% 80 x 80 178921.1 2.9 -369.6% 186032.7 2.8 -156.6% 96 x 96 205436.2 4.3 -233.4% 224513.1 3.9 -97.0% 112 x 112 244408.2 5.8 -162.7% 262158.7 5.4 -47.1% 128 x 128 321334.5 6.5 -141.3% 333829.0 6.3 -29.2% The cliff is very significantly reduced. (more to follow) --- driver/level3/level3_thread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 4ab1ee8cc..018813b8c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -658,8 +658,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG } /* Clear synchronization flags */ - for (i = 0; i < MAX_CPU_NUMBER; i++) { - for (j = 0; j < MAX_CPU_NUMBER; j++) { + for (i = 0; i < nthreads; i++) { + for (j = 0; j < nthreads; j++) { for (k = 0; k < DIVIDE_RATE; k++) { job[i].working[j][CACHE_LINE_SIZE * k] = 0; } From d148ec4ea18e672dacb1270d4a5308ccaaae18bc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:39:15 +0000 Subject: [PATCH 096/935] Don't use _Atomic for jobs sometimes... The use of _Atomic leads to really bad code generation in the compiler (on x86, you get 2 "mfence" memory barriers around each access with gcc8, despite x86 being ordered and cache coherent). But there's a fallback in the code that just uses volatile which is more than plenty in practice. If we're nervous about cross thread synchronization for these variables, we should make the YIELD function be a compiler/memory barrier instead. 
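A compiler barrier does exactly that: it tells the compiler that memory may have changed, forcing reloads and preventing reordering, while emitting no CPU instruction at all. A sketch of the GCC idiom (the same construct adopted for MB/WMB on x86-64 later in this series; the polling loop is illustrative, not the actual scheduler code):

```c
/* compiler-only barrier: blocks compiler reordering and register caching,
   costs nothing at runtime on a strongly-ordered architecture like x86 */
#define COMPILER_BARRIER() do { __asm__ __volatile__("" : : : "memory"); } while (0)

static volatile long ready;   /* flag written by another thread */

void wait_until_ready(void) {
  while (ready == 0) {
    COMPILER_BARRIER();  /* 'volatile' already forces the reload of ready;
                            the barrier additionally keeps ordinary loads and
                            stores from being moved across the wait loop */
  }
}
```

Unlike the sequentially-consistent accesses that _Atomic generates, this adds no mfence pair around every access.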
performance before (after last commit) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10630.0 10.6 0.7% 18112.8 6.2 -0.7% 64 x 64 20374.8 13.0 1.9% 40487.0 6.5 0.4% 65 x 65 141955.2 1.9 -428.3% 146708.8 1.9 -179.2% 80 x 80 178921.1 2.9 -369.6% 186032.7 2.8 -156.6% 96 x 96 205436.2 4.3 -233.4% 224513.1 3.9 -97.0% 112 x 112 244408.2 5.8 -162.7% 262158.7 5.4 -47.1% 128 x 128 321334.5 6.5 -141.3% 333829.0 6.3 -29.2% Performance with this patch (roughly a 2x improvement): Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10756.0 10.5 -0.5% 18296.7 6.1 -1.7% 64 x 64 20490.0 12.9 1.4% 40615.0 6.5 0.0% 65 x 65 83528.3 3.3 -210.9% 96319.0 2.9 -83.3% 80 x 80 101453.5 5.1 -166.3% 128021.7 4.0 -76.6% 96 x 96 149795.1 5.9 -143.1% 168059.4 5.3 -47.4% 112 x 112 191481.2 7.3 -105.8% 204165.0 6.9 -14.6% 128 x 128 265019.2 7.9 -99.0% 272006.4 7.7 -5.3% --- driver/level3/level3_thread.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 018813b8c..7e75f69d1 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,11 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L -_Atomic -#else volatile -#endif BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; From 5c6f008365ee3c6d42f8630d27259f130a688468 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:47:50 +0000 Subject: [PATCH 097/935] Tune param.h for SkylakeX param.h defines a per-platform SWITCH_RATIO, which is used as a measure for how fine grained the blocks for gemm need to be split up. Many platforms define this to 4. The reality is that the gemm low level implementation for SkylakeX likes bigger blocks due to the nature of SIMD... by tuning the SWITCH_RATIO to 32 the threading performance improves significantly: Before Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10756.0 10.5 -0.5% 18296.7 6.1 -1.7% 64 x 64 20490.0 12.9 1.4% 40615.0 6.5 0.0% 65 x 65 83528.3 3.3 -210.9% 96319.0 2.9 -83.3% 80 x 80 101453.5 5.1 -166.3% 128021.7 4.0 -76.6% 96 x 96 149795.1 5.9 -143.1% 168059.4 5.3 -47.4% 112 x 112 191481.2 7.3 -105.8% 204165.0 6.9 -14.6% 128 x 128 265019.2 7.9 -99.0% 272006.4 7.7 -5.3% After Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10666.3 10.6 0.4% 18236.9 6.2 -1.4% 64 x 64 20410.1 13.0 1.8% 39925.8 6.6 1.7% 65 x 65 34983.0 7.9 -30.2% 51494.6 5.4 2.0% 80 x 80 39769.1 13.0 -4.4% 63805.2 8.1 12.0% 96 x 96 45169.6 19.7 26.7% 80065.8 11.1 29.8% 112 x 112 57026.1 24.7 38.7% 99535.5 14.2 44.1% 128 x 128 64789.8 32.5 51.3% 117407.2 17.9 54.6% With this change, threading starts to be a win already at 96x96 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 49a5e85e8..3573fffbb 100644 --- a/param.h +++ b/param.h @@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 From 6eb4b9ae7c7cc58af00ac21b52fed8810d7e5710 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:05:04 +0000 Subject: [PATCH 098/935] Tune HASWELL SWITCH_RATIO as well Similar to the SKYLAKEX patch, 32 seems to work best (much better than 4 or 16) Before (4) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15554.3 7.2 0.2% 30353.8 3.7 0.3% 64 x 64 30346.8 8.7 1.6% 63495.0 4.1 -0.1% 65 x 65 81668.1 3.4 -123.3% 82705.2 3.3 -21.2% 80 x 80 105045.9 4.9 -95.5% 115226.0 4.5 -2.2% 96 x 96 152461.2 5.8 -74.3% 148156.3 6.0 16.4% 112 x 112 188505.2 7.5 -42.2% 171187.3 8.2 36.4% 128 x 128 257884.0 8.1 -39.5% 224764.8 9.3 46.0% Intermediate (16) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15565.7 7.2 0.2% 30378.9 3.7 0.2% 64 x 64 30430.2 8.7 1.3% 63046.4 4.2 0.6% 65 x 65 27306.0 10.1 25.3% 38879.2 7.1 43.0% 80 x 80 51008.7 10.1 5.1% 61007.6 8.4 45.9% 96 x 96 70856.7 12.5 19.0% 83403.1 10.6 53.0% 112 x 112 84769.9 16.6 36.0% 99920.1 14.1 62.9% 128 x 128 84213.2 25.0 54.5% 113024.2 18.6 72.8% After (32) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15537.3 7.2 0.3% 30537.0 3.6 -0.3% 64 x 64 30352.7 8.7 1.6% 62597.8 4.2 1.3% 65 x 65 36857.0 7.5 -0.8% 56167.6 4.9 17.7% 80 x 80 42552.6 12.1 20.8% 69536.7 7.4 38.3% 96 x 96 52101.5 17.1 40.5% 91016.1 9.7 48.7% 112 x 112 63853.7 22.1 51.8% 110507.4 12.7 58.9% 128 x 128 73966.1 28.4 60.0% 163146.4 12.9 60.8% --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3573fffbb..cfa4bba5c 100644 --- a/param.h +++ b/param.h @@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 From 73de17664dfdf2934a2fdc6dd9442107e6c85035 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:50:43 +0000 Subject: [PATCH 099/935] Add missing barriers in gemm scheduler a few places in the gemm scheduler code were missing barriers; the code likely worked OK due to heavy use of volatile / _Atomic but there's no reason to get this incorrect --- driver/level3/level3_thread.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 7e75f69d1..aeb5e6ed4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -347,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using workspace */ START_RPCC(); for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); #if defined(FUSED_GEMM) && !defined(TIMING) @@ -409,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until other region of B is initialized */ START_RPCC(); - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); /* Apply kernel with local region of A and part of other region of B */ @@ -427,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * 
bufferside] &= 0; + WMB; } } } while (current != mypos); @@ -488,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); for (i = 0; i < args -> nthreads; i++) { for (js = 0; js < DIVIDE_RATE; js++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; } } STOP_RPCC(waiting3); From 7e39ffe1135ee6ca1dc119f6eea9566668fd0916 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:53:15 +0000 Subject: [PATCH 100/935] On x86-64, make MB/WMB compiler barriers While on x86(64) one does not normally need full memory barriers, it's good practice to at least use compiler barriers for places where on other architectures memory barriers are used; this prevents the compiler from over-optimizing. --- common_x86_64.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 7461aaf60..3236778b8 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -60,8 +60,13 @@ #endif */ +#ifdef __GNUC__ +#define MB __asm__ __volatile__("": : :"memory") +#define WMB __asm__ __volatile__("": : :"memory") +#else #define MB #define WMB +#endif static void __inline blas_lock(volatile BLASULONG *address){ From 2ddc96c9e5a86e3fd12954b3efc269f0cc8d07d8 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 18:06:24 +0000 Subject: [PATCH 101/935] make WMB / MB safer on x86-64 make it so that if (foo) RMB; else MB; is always done correctly and without syntax surprises --- common_x86_64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 3236778b8..62e138e34 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -61,11 +61,11 @@ */ #ifdef __GNUC__ -#define MB __asm__ __volatile__("": : :"memory") -#define WMB __asm__ __volatile__("": : :"memory") +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) #else -#define MB -#define WMB +#define MB do {} while (0) +#define WMB do {} while (0) #endif static void __inline blas_lock(volatile BLASULONG *address){ From 2d8cc7193ace18c28ea05ef39e13bb28437b6d89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Jun 2018 23:38:14 +0200 Subject: [PATCH 102/935] Support upcoming Intel Cannon Lake CPUs as Skylake X (#1621) * Support upcoming Cannon Lake as Skylake X --- cpuid_x86.c | 17 +++++++++++++++++ driver/others/dynamic.c | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index fc937865c..89eb809b0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1339,6 +1339,23 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 6: + switch (model) { + case 6: // Cannon Lake +#ifndef NO_AVX512 + return CPUTYPE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; +#endif + } + break; case 9: case 8: switch (model) { diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4271c0a0d..bacd3b7fa 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } return NULL; + case 6: + if (model == 6) { + // Cannon Lake +#ifndef NO_AVX512 + return &gotoblas_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return &gotoblas_HASWELL; +#else + return &gotblas_SANDYBRIDGE; +#endif + else + return 
&gotoblas_NEHALEM; +#endif + } + return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake From 1f9e4f319327dd53d1243edb3a812c5a2366a938 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:46:36 +0200 Subject: [PATCH 103/935] Handle special case of gfortran+clang+OpenMP --- ctest/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ctest/Makefile b/ctest/Makefile index 6eda43863..569a5dda3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -102,7 +102,13 @@ clean :: rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) From 6a5ab083b7e78458861b197b8e98b2506345d6d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:47:33 +0200 Subject: [PATCH 104/935] Handle special case of gfortran+clang+OpenMP --- test/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/Makefile b/test/Makefile index 65fb6f438..074411b05 100644 --- a/test/Makefile +++ b/test/Makefile @@ -122,8 +122,13 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = - +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) From 10b70c904d9e3b610d35f1efe8d89888da4011bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:53:19 +0200 Subject: [PATCH 105/935] Handle erroneous user settings NOFORTRAN=0 and NO_FORTRAN --- Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile b/Makefile index 56b4426f8..728567f80 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif +ifeq ($(NOFORTRAN), 0) +undefine NOFORTRAN +endif + +ifeq ($(NO_FORTRAN), 1) +undefine NO_FORTRAN +NOFORTRAN=1 +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench From 9369d3e6e5207c6974af162e67d4060ed625c322 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 23:28:06 +0200 Subject: [PATCH 106/935] Modify NOFORTRAN tests to always check the value; fix rewriting of NO_FORTRAN --- Makefile | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 728567f80..4760be0be 100644 --- a/Makefile +++ b/Makefile @@ -21,13 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif -ifeq ($(NOFORTRAN), 0) -undefine NOFORTRAN -endif - ifeq ($(NO_FORTRAN), 1) -undefine NO_FORTRAN -NOFORTRAN=1 +define NOFORTRAN +1 +endef +define NO_LAPACK +1 +endef +export NOFORTRAN +export NO_LAPACK endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) @@ -56,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... 
$(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -117,7 +119,7 @@ endif endif tests : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -219,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -240,7 +242,7 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -283,21 +285,21 @@ endif endif large.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From 952541e840bddbcdcdfce81aefc09edf7fbfb84f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jun 2018 13:20:30 +0200 Subject: [PATCH 107/935] Need to use filter-out to handle NOFORTRAN not set --- Makefile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 4760be0be..49dab6484 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... 
$(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +119,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,7 +242,10 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) + $(info filter value of NOFORTRAN is:) + $(info x$(filter-out $(NOFORTRAN), 1 2)x) + +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc From 0c5b7b400b3973d214ce24c566be4446743eacf7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jun 2018 15:16:19 +0200 Subject: [PATCH 108/935] Add -march=skylake-avx512 to flags if target is skylake x --- Makefile.x86_64 | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1ba63278a..677c05d93 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,13 @@ endif endif endif +ifeq ($(CORE), SKYLAKEX) +ifndef NO_AVX512 +CCOMMON_OPT += -march=skylake-avx512 +FCOMMON_OPT += -march=skylake-avx512 +endif +endif + ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 endif From 05978528c3f3c61fb370e1fae0ac3013faaa595e Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Wed, 20 Jun 2018 17:03:18 +0100 Subject: [PATCH 109/935] Avoid declaring arrays of size 0 when making large stack allocations. --- common_stackalloc.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/common_stackalloc.h b/common_stackalloc.h index 71fb1a477..ec0fa1611 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - large enough to support all architectures and kernel * Chosing a too small SIZE will lead to a stack smashing. */ -#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ - /* make it volatile because some function (ex: dgemv_n.S) */ \ - /* do not restore all register */ \ - volatile int stack_alloc_size = SIZE; \ - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ - stack_alloc_size = 0; \ - STACK_ALLOC_PROTECT_SET \ - TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ +#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ + /* make it volatile because some function (ex: dgemv_n.S) */ \ + /* do not restore all register */ \ + volatile int stack_alloc_size = SIZE; \ + if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ + STACK_ALLOC_PROTECT_SET \ + /* Avoid declaring an array of length 0 */ \ + TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ + __attribute__((aligned(0x20))); \ BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); #else //Original OpenBLAS/GotoBLAS codes. From a399d004257b2f43e8211341f924f3a73171b98c Mon Sep 17 00:00:00 2001 From: oon3m0oo Date: Wed, 20 Jun 2018 21:04:03 +0100 Subject: [PATCH 110/935] Further improvements to memory.c. 
(#1625) - Compiler TLS is now only used when the compiler supports it - If compiler TLS is unsupported, we use platform-specific TLS - Only one variable (an index) is now in TLS - We only access TLS once per alloc, and never when freeing - Allocation / release info is now stored within the allocation itself, by over-allocating; this saves having external structures do the bookkeeping, and reduces some of the redundant data that was being stored (such as addresses) - We never hit the alloc lock when not using SMP or when using OpenMP (that was my fault) - Now that there are fewer tracking structures I think this is a bit easier to read than before --- driver/others/memory.c | 397 +++++++++++++++++++++++++---------------- 1 file changed, 242 insertions(+), 155 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 85f790615..ed20cf5cd 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -326,6 +326,8 @@ int goto_get_num_procs (void) { return blas_cpu_number; } +static void blas_memory_init(); + void openblas_fork_handler() { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is @@ -337,7 +339,7 @@ void openblas_fork_handler() // implementation of OpenMP. #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; - err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); + err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init); if(err != 0) openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n"); #endif @@ -415,23 +417,104 @@ int openblas_get_num_threads(void) { #endif } -struct release_t { - void *address; - void (*func)(struct release_t *); - long attr; -}; - int hugetlb_allocated = 0; #if defined(OS_WINDOWS) #define THREAD_LOCAL __declspec(thread) -#define UNLIKELY_TO_BE_ZERO(x) (x) +#define LIKELY_ONE(x) (x) #else #define THREAD_LOCAL __thread -#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#define LIKELY_ONE(x) (__builtin_expect(x, 1)) +#endif + +/* Stores information about the allocation and how to release it */ +struct alloc_t { + /* Whether this allocation is being used */ + int used; + /* Any special attributes needed when releasing this allocation */ + int attr; + /* Function that can properly release this memory */ + void (*release_func)(struct alloc_t *); + /* Pad to 64-byte alignment */ + char pad[64 - 2 * sizeof(int) - sizeof(void(*))]; +}; + +/* Convenience macros for storing release funcs */ +#define STORE_RELEASE_FUNC(address, func) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + } + +#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + alloc_info->attr = attr; \ + } + +/* The number of bytes that will be allocated for each buffer. When allocating memory, we store an alloc_t followed by the actual buffer memory. This means that each allocation always has its associated alloc_t, without the need for an auxiliary tracking structure. 
*/ +static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); + +/* Clang supports TLS from version 2.8 */ +#if defined(__clang__) && __clang_major__ > 2 || \ + (__clang_minor__ == 2 || __clang_minor__ == 8) +#define HAS_COMPILER_TLS #endif -static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; -static int THREAD_LOCAL release_pos = 0; + +/* GCC supports TLS from version 4.1 */ +#if !defined(__clang__) && defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) +#define HAS_COMPILER_TLS +#endif + +/* MSVC supports TLS from version 2005 */ +#if defined(_MSC_VER) && _MSC_VER >= 1400 +#define HAS_COMPILER_TLS +#endif + +/* Versions of XCode before 8 did not properly support TLS */ +#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 +#undef HAS_COMPILER_TLS +#endif + +/* Android NDK's before version 12b did not support TLS */ +#if defined(__ANDROID__) && defined(__clang__) +#if __has_include() +#include +#endif +#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ + defined(__NDK_MINOR__) && \ + ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) +#undef HAS_COMPILER_TLS +#endif +#endif + +/* Holds pointers to allocated memory */ +#if defined(SMP) && !defined(USE_OPENMP) +/* This is the number of threads than can be spawned by the server, which is the + server plus the number of threads in the thread pool */ +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +static int next_memory_table_pos = 0; +# if defined(HAS_COMPILER_TLS) +/* Use compiler generated thread-local-storage */ +static int THREAD_LOCAL local_memory_table_pos = 0; +# else +/* Use system-dependent thread-local-storage */ +# if defined(OS_WINDOWS) +static DWORD local_storage_key; +# else +static pthread_key_t local_storage_key; +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#else +/* There is only one allocating thread when in single-threaded mode and when using OpenMP */ +# define MAX_ALLOCATING_THREADS 1 +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -447,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +/* Returns a pointer to the start of the per-thread memory allocation data */ +static __inline struct alloc_t ** get_memory_table() { +#if defined(SMP) && !defined(USE_OPENMP) +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); +# else + int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + if (!local_memory_table_pos) { + LOCK_COMMAND(&alloc_lock); + local_memory_table_pos = next_memory_table_pos++; + UNLOCK_COMMAND(&alloc_lock); + if (next_memory_table_pos > MAX_ALLOCATING_THREADS) + printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); +# else + pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + } + return local_memory_table[local_memory_table_pos]; +#else + return local_memory_table[0]; +#endif /* 
defined(SMP) && !defined(USE_OPENMP) */ +} + #ifdef ALLOC_MMAP -static void alloc_mmap_free(struct release_t *release){ +static void alloc_mmap_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : munmap failed\n"); } } @@ -465,22 +578,18 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); } - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif return map_address; @@ -533,25 +642,25 @@ static void *alloc_mmap(void *address){ if (address){ /* Just give up use advanced operation */ - map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #if defined(OS_LINUX) && !defined(NO_WARMUP) if (hot_alloc == 0) { - map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #endif - map_address = mmap(NULL, BUFFER_SIZE * SCALING, + map_address = mmap(NULL, allocation_block_size * SCALING, MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { @@ -559,7 +668,7 @@ static void *alloc_mmap(void *address){ #ifdef OS_LINUX #ifdef DEBUG int ret=0; - ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; perror("OpenBLAS alloc_mmap:"); @@ -567,7 +676,7 @@ static void *alloc_mmap(void *address){ } #else - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif @@ -575,7 +684,7 @@ static void *alloc_mmap(void *address){ allocsize = DGEMM_P * DGEMM_Q * sizeof(double); start = (BLASULONG)map_address; - current = (SCALING - 1) * BUFFER_SIZE; + current = (SCALING - 1) * allocation_block_size; while(current > 0) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; @@ -590,7 +699,7 @@ static void *alloc_mmap(void *address){ best = (BLASULONG)-1; best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { current = run_bench(start, allocsize); @@ -606,7 +715,7 @@ static void *alloc_mmap(void *address){ if ((BLASULONG)best_address > (BLASULONG)map_address) munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); - munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * 
BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address); map_address = best_address; @@ -619,11 +728,7 @@ static void *alloc_mmap(void *address){ } #endif - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); return map_address; } @@ -635,9 +740,9 @@ static void *alloc_mmap(void *address){ #ifdef ALLOC_MALLOC -static void alloc_malloc_free(struct release_t *release){ +static void alloc_malloc_free(struct alloc_t *alloc_info){ - free(release -> address); + free(alloc_info); } @@ -645,15 +750,11 @@ static void *alloc_malloc(void *address){ void *map_address; - map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_malloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_malloc_free); return map_address; @@ -670,24 +771,20 @@ void *qfree (void *address); #define QCOMMS 0x2 #define QFAST 0x4 -static void alloc_qalloc_free(struct release_t *release){ +static void alloc_qalloc_free(struct alloc_t *alloc_info){ - qfree(release -> address); + qfree(alloc_info); } static void *alloc_qalloc(void *address){ void *map_address; - map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_qalloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_qalloc_free); return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); } @@ -696,9 +793,9 @@ static void *alloc_qalloc(void *address){ #ifdef ALLOC_WINDOWS -static void alloc_windows_free(struct release_t *release){ +static void alloc_windows_free(struct alloc_t *alloc_info){ - VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT); } @@ -706,17 +803,13 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_windows_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_windows_free); return map_address; } @@ -728,13 +821,14 @@ static void *alloc_windows(void *address){ #define DEVICEDRIVER_NAME "/dev/mapper" #endif -static void alloc_devicedirver_free(struct release_t *release){ +static void alloc_devicedirver_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(address, allocation_block_size)) { printf("OpenBLAS : Bugphysarea unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : Bugphysarea close failed.\n"); } @@ -751,17 +845,12 @@ 
static void *alloc_devicedirver(void *address){ } - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_devicedirver_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd); return map_address; } @@ -770,9 +859,9 @@ static void *alloc_devicedirver(void *address){ #ifdef ALLOC_SHM -static void alloc_shm_free(struct release_t *release){ +static void alloc_shm_free(struct alloc_t *alloc_info){ - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Shared memory unmap failed.\n"); } } @@ -781,22 +870,21 @@ static void *alloc_shm(void *address){ void *map_address; int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600); map_address = (void *)shmat(shmid, address, 0); if (map_address != (void *)-1){ #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif shmctl(shmid, IPC_RMID, 0); - release_info[release_pos].address = map_address; - release_info[release_pos].attr = shmid; - release_info[release_pos].func = alloc_shm_free; - release_pos ++; + struct alloc_t *alloc_info = (struct alloc_t *)map_address; + alloc_info->release_func = alloc_shm_free; + alloc_info->attr = shmid; } return map_address; @@ -804,23 +892,23 @@ static void *alloc_shm(void *address){ #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS -static void alloc_hugetlb_free(struct release_t *release){ +static void alloc_hugetlb_free(struct alloc_t *alloc_info){ #if defined(OS_LINUX) || defined(OS_AIX) - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Hugepage unmap failed.\n"); } #endif #ifdef __sun__ - munmap(release -> address, BUFFER_SIZE); + munmap(alloc_info, allocation_block_size); #endif #ifdef OS_WINDOWS - VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT); #endif @@ -833,7 +921,7 @@ static void *alloc_hugetlb(void *address){ #if defined(OS_LINUX) || defined(OS_AIX) int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, + shmid = shmget(IPC_PRIVATE, allocation_block_size, #ifdef OS_LINUX SHM_HUGETLB | #endif @@ -846,7 +934,7 @@ static void *alloc_hugetlb(void *address){ map_address = (void *)shmat(shmid, address, SHM_RND); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif if (map_address != (void *)-1){ @@ -863,7 +951,7 @@ static void *alloc_hugetlb(void *address){ mha.mha_pagesize = HUGE_PAGESIZE; memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); - map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size); #endif #ifdef OS_WINDOWS @@ -887,7 +975,7 @@ static void *alloc_hugetlb(void *address){ } map_address = (void *)VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); @@ -898,11 +986,7 @@ static void *alloc_hugetlb(void *address){ #endif - if (map_address != (void *)-1){ - 
release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_hugetlb_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free); return map_address; } @@ -914,13 +998,14 @@ static void *alloc_hugetlb(void *address){ static int hugetlb_pid = 0; -static void alloc_hugetlbfile_free(struct release_t *release){ +static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : HugeTLBfs unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : HugeTLBfs close failed.\n"); } } @@ -941,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_hugetlbfile_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd); return map_address; } @@ -964,19 +1044,11 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -struct memory_t { - void *addr; - int used; -#ifndef __64BIT__ - char dummy[48]; +#if __STDC_VERSION__ >= 201112L +static _Atomic int memory_initialized = 0; #else - char dummy[40]; +static volatile int memory_initialized = 0; #endif -}; - -static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; - -static int memory_initialized = 0; /* Memory allocation routine */ /* procpos ... indicates where it comes from */ @@ -984,6 +1056,20 @@ static int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ +static void blas_memory_init(){ +#if defined(SMP) && !defined(USE_OPENMP) + next_memory_table_pos = 0; +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + local_storage_key = ::TlsAlloc(); +# else + pthread_key_create(&local_storage_key, NULL); +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#endif /* defined(SMP) && !defined(USE_OPENMP) */ + memset(local_memory_table, 0, sizeof(local_memory_table)); +} + void *blas_memory_alloc(int procpos){ int position; @@ -1016,14 +1102,17 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + struct alloc_t * alloc_info; + struct alloc_t ** alloc_table; - if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - + if (!LIKELY_ONE(memory_initialized)) { +#if defined(SMP) && !defined(USE_OPENMP) /* Only allow a single thread to initialize memory system */ LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { - +#endif + blas_memory_init(); #ifdef DYNAMIC_ARCH gotoblas_dynamic_init(); #endif @@ -1044,8 +1133,10 @@ void *blas_memory_alloc(int procpos){ memory_initialized = 1; +#if defined(SMP) && !defined(USE_OPENMP) } UNLOCK_COMMAND(&alloc_lock); +#endif } #ifdef DEBUG @@ -1053,9 +1144,9 @@ void *blas_memory_alloc(int procpos){ #endif position = 0; - + alloc_table = get_memory_table(); do { - if (!memory[position].used) goto allocation; + if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; } while (position < BUFFERS_PER_THREAD); @@ -1068,9 +1159,8 @@ void *blas_memory_alloc(int procpos){ printf(" Position -> %d\n", position); #endif - memory[position].used = 1; - - if (!memory[position].addr) { + alloc_info = 
alloc_table[position]; + if (!alloc_info) { do { #ifdef DEBUG printf("Allocation Start : %lx\n", base_address); @@ -1082,7 +1172,7 @@ void *blas_memory_alloc(int procpos){ while ((func != NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { @@ -1110,23 +1200,24 @@ void *blas_memory_alloc(int procpos){ #endif if (((BLASLONG) map_address) == -1) base_address = 0UL; - if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE; } while ((BLASLONG)map_address == -1); - memory[position].addr = map_address; + alloc_table[position] = alloc_info = map_address; #ifdef DEBUG - printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); + printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position); #endif } #ifdef DEBUG - printf("Mapped : %p %3d\n\n", - (void *)memory[position].addr, position); + printf("Mapped : %p %3d\n\n", (void *)alloc_info, position); #endif - return (void *)memory[position].addr; + alloc_info->used = 1; + + return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); @@ -1134,25 +1225,19 @@ void *blas_memory_alloc(int procpos){ return NULL; } -void blas_memory_free(void *free_area){ - - int position; - +void blas_memory_free(void *buffer){ #ifdef DEBUG - printf("Unmapped Start : %p ...\n", free_area); + int position; + struct alloc_t ** alloc_table; #endif - - position = 0; - while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) - position++; - - if (memory[position].addr != free_area) goto error; + /* Since we passed an offset pointer to the caller, get back to the actual allocation */ + struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t)); #ifdef DEBUG - printf(" Position : %d\n", position); + printf("Unmapped Start : %p ...\n", alloc_info); #endif - memory[position].used = 0; + alloc_info->used = 0; #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1160,12 +1245,13 @@ void blas_memory_free(void *free_area){ return; - error: - printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); - #ifdef DEBUG - for (position = 0; position < BUFFERS_PER_THREAD; position++) - printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); + alloc_table = get_memory_table(); + for (position = 0; position < BUFFERS_PER_THREAD; position++){ + if (alloc_table[position]) { + printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); + } + } #endif return; } @@ -1182,14 +1268,20 @@ void blas_memory_free_nolock(void * map_address) { void blas_shutdown(void){ - int pos; + int pos, thread; #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - for (pos = 0; pos < release_pos; pos ++) { - release_info[pos].func(&release_info[pos]); + for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ + struct alloc_t *alloc_info = local_memory_table[thread][pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + alloc_info = (void *)0; + } + } } #ifdef SEEK_ADDRESS @@ -1198,11 +1290,6 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ - memory[pos].addr = (void *)0; - memory[pos].used = 0; - } - return; } @@ -1226,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size_t size; BLASULONG buffer; - size = BUFFER_SIZE - PAGESIZE; + size = allocation_block_size - PAGESIZE; buffer = (BLASULONG)sa + GEMM_OFFSET_A; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -1247,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, UNLOCK_COMMAND(&init_lock); #endif - size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + size = MIN((allocation_block_size - PAGESIZE), L2_SIZE); buffer = (BLASULONG)sa + GEMM_OFFSET_A; while (size > 0) { From 28c28ed275df2fd812bcdc75fdc04cdb6d9580b3 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 21 Jun 2018 11:13:57 +0100 Subject: [PATCH 111/935] Fix data races reported by TSAN. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ed20cf5cd..7eff16ce3 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -543,9 +543,9 @@ static __inline struct alloc_t ** get_memory_table() { if (!local_memory_table_pos) { LOCK_COMMAND(&alloc_lock); local_memory_table_pos = next_memory_table_pos++; - UNLOCK_COMMAND(&alloc_lock); if (next_memory_table_pos > MAX_ALLOCATING_THREADS) printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); + UNLOCK_COMMAND(&alloc_lock); # if !defined(HAS_COMPILER_TLS) # if defined(OS_WINDOWS) ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); From 2aa0a5804e381f89a53fdbef9bd51e8af23c8940 Mon Sep 17 00:00:00 2001 From: oon3m0oo Date: Thu, 21 Jun 2018 17:47:45 +0100 Subject: [PATCH 112/935] Use BLAS rather than CBLAS in test_fork.c (#1626) This is handy for people not using lapack. 
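The switch from cblas_dgemm to BLASFUNC(dgemm) works because any row-major CBLAS
call can be expressed through the Fortran-convention interface by swapping the
operand order: a row-major C = A*B is the column-major C^T = B^T * A^T, and a
row-major matrix reinterpreted as column-major is exactly its transpose. A
minimal sketch of that mapping (editor's illustration, not part of the patch;
the underscore-mangled dgemm_ symbol and the plain int arguments are assumptions
about the platform — the real tree hides the mangling behind BLASFUNC() and uses
blasint):

/* Fortran-convention DGEMM as exported by a BLAS library. */
extern void dgemm_(const char *transa, const char *transb,
                   const int *m, const int *n, const int *k,
                   const double *alpha, const double *a, const int *lda,
                   const double *b, const int *ldb,
                   const double *beta, double *c, const int *ldc);

/* Row-major C(m x n) = alpha*A(m x k)*B(k x n) + beta*C, computed by
   handing the operands to the column-major interface in swapped order. */
static void dgemm_rowmajor(int m, int n, int k, double alpha,
                           const double *a, const double *b,
                           double beta, double *c)
{
  const char no = 'N';
  dgemm_(&no, &no, &n, &m, &k, &alpha, b, &n, a, &k, &beta, c, &n);
}

The test itself only requires that the results computed before and after fork()
agree, so the patch does not need to reproduce the old row-major layout exactly;
it just uses one consistent transpose setting in both calls.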
--- utest/CMakeLists.txt | 2 -- utest/Makefile | 2 -- utest/test_fork.c | 22 +++++++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 77a42d84f..1b426afe7 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -25,7 +25,6 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms -if (NOT NO_CBLAS) if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) set(OpenBLAS_utest_src @@ -34,7 +33,6 @@ set(OpenBLAS_utest_src ) endif() endif() -endif() if (NOT NO_LAPACK) set(OpenBLAS_utest_src diff --git a/utest/Makefile b/utest/Makefile index e071540dc..e40b3c6db 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -17,13 +17,11 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch -ifneq ($(NO_CBLAS), 1) ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) OBJS += test_fork.o endif endif -endif all : run_test diff --git a/utest/test_fork.c b/utest/test_fork.c index 9e0244305..9fc51287c 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -48,11 +48,13 @@ void* xmalloc(size_t n) } } -void check_dgemm(double *a, double *b, double *result, double *expected, int n) +void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; int i; - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, result, n); + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); for(i = 0; i < n * n; ++i) { ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); } @@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n) CTEST(fork, safety) { - int n = 1000; + blasint n = 1000; int i; double *a, *b, *c, *d; @@ -84,8 +86,10 @@ CTEST(fork, safety) // Compute a DGEMM product in the parent process prior to forking to // ensure that the OpenBLAS thread pool is initialized. 
- cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, c, n); + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n); fork_pid = fork(); if (fork_pid == -1) { From 9cf22b7d9129e186a1ee941fbab8e45328c50b61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 13:27:30 +0200 Subject: [PATCH 113/935] Build cblas_iXamin interfaces --- interface/Makefile | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 9b2b93b83..20ec74e9e 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) CSBLAS1OBJS = \ - cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ + cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) CDBLAS1OBJS = \ - cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ + cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ cblas_dgeadd.$(SUFFIX) CCBLAS1OBJS = \ - cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ + cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_ccopy.$(SUFFIX) \ cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ CZBLAS1OBJS = \ - cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ + cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) +cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) From eb71d61c7cb6640e66a5239d1113de8a8c1477df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 13:31:09 +0200 Subject: [PATCH 114/935] Expose CBLAS interface to BLAS extensions iXamin --- cblas.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cblas.h b/cblas.h index 
89f78c133..6461f4209 100644 --- a/cblas.h +++ b/cblas.h @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); From 0b2b83d9ed91e5e9234e41b1d41b0a7f21f5234c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 19:41:32 +0200 Subject: [PATCH 115/935] Add support for a user-defined list of dynamic targets --- Makefile.system | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 62ba0e466..4712d9525 100644 --- a/Makefile.system +++ b/Makefile.system @@ -248,7 +248,7 @@ endif ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET -export MACOSX_DEPLOYMENT_TARGET=10.6 +export MACOSX_DEPLOYMENT_TARGET=10.8 endif MD5SUM = md5 -r endif @@ -497,6 +497,14 @@ endif endif endif +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +CCOMMON_OPT += $(XCCOMMON_OPT) +#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= From 1833a6707157abe966f39dcac90530c2461117d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 19:42:15 +0200 Subject: [PATCH 116/935] Add support for a user-defined list of dynamic targets --- driver/others/dynamic.c | 139 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4271c0a0d..d5ed6d164 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -49,6 +49,127 @@ #define EXTERN #endif +#ifdef DYNAMIC_LIST +extern gotoblas_t gotoblas_PRESCOTT; + +#ifdef DYN_ATHLON +extern gotoblas_t gotoblas_ATHLON; +#else +#define gotoblas_ATHLON gotoblas_PRESCOTT +#endif +#ifdef DYN_KATMAI +extern gotoblas_t gotoblas_KATMAI; +#else +#define gotoblas_KATMAI gotoblas_PRESCOTT +#endif +#ifdef DYN_BANIAS +extern gotoblas_t gotoblas_BANIAS; +#else +#define gotoblas_BANIAS gotoblas_PRESCOTT +#endif +#ifdef DYN_COPPERMINE +extern gotoblas_t gotoblas_COPPERMINE; +#else +#define gotoblas_COPPERMINE gotoblas_PRESCOTT +#endif +#ifdef DYN_NORTHWOOD +extern gotoblas_t gotoblas_NORTHWOOD; +#else +#define gotoblas_NORTHWOOD gotoblas_PRESCOTT +#endif +#ifdef DYN_CORE2 +extern gotoblas_t gotoblas_CORE2; 
+#else +#define gotoblas_CORE2 gotoblas_PRESCOTT +#endif +#ifdef DYN_NEHALEM +extern gotoblas_t gotoblas_NEHALEM; +#else +#define gotoblas_NEHALEM gotoblas_PRESCOTT +#endif +#ifdef DYN_BARCELONA +extern gotoblas_t gotoblas_BARCELONA; +#else +#define gotoblas_BARCELONA gotoblas_PRESCOTT +#endif +#ifdef DYN_ATOM +extern gotoblas_t gotoblas_ATOM; +#else +#define gotoblas_ATOM gotoblas_PRESCOTT +#endif +#ifdef DYN_NANO +extern gotoblas_t gotoblas_NANO; +#else +#define gotoblas_NANO gotoblas_PRESCOTT +#endif +#ifdef DYN_PENRYN +extern gotoblas_t gotoblas_PENRYN; +#else +#define gotoblas_PENRYN gotoblas_PRESCOTT +#endif +#ifdef DYN_DUNNINGTON +extern gotoblas_t gotoblas_DUNNINGTON; +#else +#define gotoblas_DUNNINGTON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON +extern gotoblas_t gotoblas_OPTERON; +#else +#define gotoblas_OPTERON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON_SSE3 +extern gotoblas_t gotoblas_OPTERON_SSE3; +#else +#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT +#endif +#ifdef DYN_BOBCAT +extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_BOBCAT gotoblas_PRESCOTT +#endif +#ifdef DYN_SANDYBRIDGE +extern gotoblas_t gotoblas_SANDYBRIDGE; +#else +#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT +#endif +#ifdef DYN_BULLDOZER +extern gotoblas_t gotoblas_BULLDOZER; +#else +#define gotoblas_BULLDOZER gotoblas_PRESCOTT +#endif +#ifdef DYN_PILEDRIVER +extern gotoblas_t gotoblas_PILEDRIVER; +#else +#define gotoblas_PILEDRIVER gotoblas_PRESCOTT +#endif +#ifdef DYN_STEAMROLLER +extern gotoblas_t gotoblas_STEAMROLLER; +#else +#define gotoblas_STEAMROLLER gotoblas_PRESCOTT +#endif +#ifdef DYN_EXCAVATOR +extern gotoblas_t gotoblas_EXCAVATOR; +#else +#define gotoblas_EXCAVATOR gotoblas_PRESCOTT +#endif +#ifdef DYN_HASWELL +extern gotoblas_t gotoblas_HASWELL; +#else +#define gotoblas_HASWELL gotoblas_PRESCOTT +#endif +#ifdef DYN_ZEN +extern gotoblas_t gotoblas_ZEN; +#else +#define gotoblas_ZEN gotoblas_PRESCOTT +#endif +#ifdef DYN_SKYLAKEX +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_PRESCOTT +#endif + + +#else // not DYNAMIC_LIST EXTERN gotoblas_t gotoblas_KATMAI; EXTERN gotoblas_t gotoblas_COPPERMINE; EXTERN gotoblas_t gotoblas_NORTHWOOD; @@ -108,6 +229,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define gotoblas_ZEN gotoblas_BARCELONA #endif +#endif // DYNAMIC_LIST #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -338,6 +460,23 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } return NULL; + case 6: + if (model == 6) { + // Cannon Lake +#ifndef NO_AVX512 + return &gotoblas_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return &gotoblas_HASWELL; +#else + return &gotblas_SANDYBRIDGE; +#endif + else + return &gotoblas_NEHALEM; +#endif + } + return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake From 01440685379f11f158c5f612cf15fc279eb16c88 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Mon, 25 Jun 2018 13:53:11 +0100 Subject: [PATCH 117/935] Rewrite &= -> = and simplify the initial blocking phase. 
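The flags being rewritten here are the per-thread job[].working words: a
producer fills a region of B, publishes its address through the flag, consumers
spin until the flag becomes non-zero, and clear it again when done; `= 0` and
the old `&= 0` store the same value. A simplified model of that handshake,
written with C11 atomics for clarity (editor's sketch, not OpenBLAS code — the
real implementation uses volatile/_Atomic words plus the WMB/MB barrier macros
and YIELDING). Note that PATCH 139 further below reverts this commit.

#include <stdatomic.h>
#include <stdint.h>

/* One such flag exists per (owner, peer, buffer) slot in the real job_t. */
static _Atomic uintptr_t working;

/* Producer: a region of B is ready, publish its address. */
static void publish(void *buffer) {
  atomic_store(&working, (uintptr_t)buffer);
}

/* Consumer: wait until the region is published, then use it. */
static void *acquire(void) {
  uintptr_t p;
  while ((p = atomic_load(&working)) == 0)
    ;                        /* the real loop does YIELDING; MB; here */
  return (void *)p;
}

/* Consumer: done with the region; a plain store replaces the old `&= 0`. */
static void release_region(void) {
  atomic_store(&working, 0);
}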
--- driver/level3/level3_thread.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..ee3e3b9a9 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -344,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { - /* Make sure if no one is using workspace */ - START_RPCC(); - for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; - STOP_RPCC(waiting1); - #if defined(FUSED_GEMM) && !defined(TIMING) /* Fused operation to copy region of B into workspace and apply kernel */ @@ -387,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - /* Set flag so other threads can access local region of B */ - for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { + /* Make sure if no one is using workspace */ + START_RPCC(); + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + STOP_RPCC(waiting1); + /* Set flag so other threads can access local region of B */ job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; + WMB; + } } /* Get regions of B from other threads and apply kernel */ @@ -426,13 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } } } while (current != mypos); - /* Iterate through steps of m + /* Iterate through steps of m * Note: First step has already been finished */ for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; @@ -462,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); - + #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif - + /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } } From 750162a05f8c6d0d9530955f78e8e6bb138d8df9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Jun 2018 21:02:31 +0200 Subject: [PATCH 118/935] Try gradual fallback for cores not in the dynamic core list --- driver/others/dynamic.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index d5ed6d164..13794207c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -89,11 +89,15 @@ extern gotoblas_t gotoblas_NEHALEM; #endif #ifdef DYN_BARCELONA extern gotoblas_t gotoblas_BARCELONA; +#elif defined(DYN_NEHALEM) +#define gotoblas_BARCELONA gotoblas_NEHALEM #else #define gotoblas_BARCELONA gotoblas_PRESCOTT #endif #ifdef DYN_ATOM extern gotoblas_t gotoblas_ATOM; +elif defined(DYN_NEHALEM) +#define gotoblas_ATOM gotoblas_NEHALEM #else #define gotoblas_ATOM 
gotoblas_PRESCOTT #endif @@ -124,46 +128,82 @@ extern gotoblas_t gotoblas_OPTERON_SSE3; #endif #ifdef DYN_BOBCAT extern gotoblas_t gotoblas_BOBCAT; +#elif defined(DYN_NEHALEM) +#define gotoblas_BOBCAT gotoblas_NEHALEM #else #define gotoblas_BOBCAT gotoblas_PRESCOTT #endif #ifdef DYN_SANDYBRIDGE extern gotoblas_t gotoblas_SANDYBRIDGE; +#elif defined(DYN_NEHALEM) +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #else #define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT #endif #ifdef DYN_BULLDOZER extern gotoblas_t gotoblas_BULLDOZER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_BULLDOZER gotoblas_NEHALEM #else #define gotoblas_BULLDOZER gotoblas_PRESCOTT #endif #ifdef DYN_PILEDRIVER extern gotoblas_t gotoblas_PILEDRIVER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_PILEDRIVER gotoblas_NEHALEM #else #define gotoblas_PILEDRIVER gotoblas_PRESCOTT #endif #ifdef DYN_STEAMROLLER extern gotoblas_t gotoblas_STEAMROLLER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_STEAMROLLER gotoblas_NEHALEM #else #define gotoblas_STEAMROLLER gotoblas_PRESCOTT #endif #ifdef DYN_EXCAVATOR extern gotoblas_t gotoblas_EXCAVATOR; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_EXCAVATOR gotoblas_NEHALEM #else #define gotoblas_EXCAVATOR gotoblas_PRESCOTT #endif #ifdef DYN_HASWELL extern gotoblas_t gotoblas_HASWELL; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_HASWELL gotoblas_NEHALEM #else #define gotoblas_HASWELL gotoblas_PRESCOTT #endif #ifdef DYN_ZEN extern gotoblas_t gotoblas_ZEN; +#elif defined(DYN_HASWELL) +#define gotoblas_ZEN gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_ZEN gotoblas_NEHALEM #else #define gotoblas_ZEN gotoblas_PRESCOTT #endif #ifdef DYN_SKYLAKEX extern gotoblas_t gotoblas_SKYLAKEX; +#elif defined(DYN_HASWELL) +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SKYLAKEX gotoblas_NEHALEM #else #define gotoblas_SKYLAKEX gotoblas_PRESCOTT #endif From 092175cfec7d49d40904aeff1d8121acb4ed1452 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 08:09:52 +0200 Subject: [PATCH 119/935] Revert changes to NOFORTRAN handling from 952541e --- Makefile | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 49dab6484..56b4426f8 100644 --- a/Makefile +++ b/Makefile @@ -21,17 +21,6 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif -ifeq ($(NO_FORTRAN), 1) -define NOFORTRAN -1 -endef -define NO_LAPACK -1 -endef -export NOFORTRAN -export NO_LAPACK -endif - LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench @@ -58,7 +47,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN @echo " Fortran compiler ... 
$(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +108,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +210,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,10 +231,7 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : - $(info filter value of NOFORTRAN is:) - $(info x$(filter-out $(NOFORTRAN), 1 2)x) - -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -288,21 +274,21 @@ endif endif large.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From e322a951febc933e0bae192dcb117e447df24050 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:44:13 +0200 Subject: [PATCH 120/935] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/cdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index e5a6e4d35..fd86a37b0 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble cdot_kernel_L999 - cmp INC_X, #0 - beq cdot_kernel_L999 +# cmp INC_X, #0 +# beq cdot_kernel_L999 - cmp INC_Y, #0 - beq cdot_kernel_L999 +# cmp INC_Y, #0 +# beq cdot_kernel_L999 cmp INC_X, #1 bne cdot_kernel_S_BEGIN From 545b82efd30e4e0a33cb57bb7c6fb12601a6d3d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:45:00 +0200 Subject: [PATCH 121/935] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/ddot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index fb294d8b4..cc2e485b7 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -164,11 +164,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, #0 ble ddot_kernel_L999 - cmp INC_X, #0 - beq ddot_kernel_L999 +# cmp INC_X, #0 +# beq ddot_kernel_L999 - cmp INC_Y, #0 - beq ddot_kernel_L999 +# cmp INC_Y, #0 +# beq ddot_kernel_L999 cmp INC_X, #1 bne ddot_kernel_S_BEGIN From e344db269b5b45d08ff4ce60801de0ece0965866 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:45:57 +0200 Subject: [PATCH 122/935] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/sdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index 5f4f424bf..544846258 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -253,11 +253,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble sdot_kernel_L999 - cmp INC_X, #0 - beq sdot_kernel_L999 +# cmp INC_X, #0 +# beq sdot_kernel_L999 - cmp INC_Y, #0 - beq sdot_kernel_L999 +# cmp INC_Y, #0 +# beq sdot_kernel_L999 cmp INC_X, #1 bne sdot_kernel_S_BEGIN From b83e4c60c73e80269e84b46590005d622d05e6d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:46:42 +0200 Subject: [PATCH 123/935] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/zdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 43f2c0c0b..c0cd92d3c 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -218,11 +218,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble zdot_kernel_L999 - cmp INC_X, #0 - beq zdot_kernel_L999 +# cmp INC_X, #0 +# beq zdot_kernel_L999 - cmp INC_Y, #0 - beq zdot_kernel_L999 +# cmp INC_Y, #0 +# beq zdot_kernel_L999 cmp INC_X, #1 bne zdot_kernel_S_BEGIN From 8396e9e7774537b95ea1409f90d6e98d5d5a6800 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Jun 2018 00:00:27 +0200 Subject: [PATCH 124/935] Handle NOFORTRAN=0 --- Makefile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 49dab6484..9a7a25bfc 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @echo " Fortran compiler ... 
$(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +119,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,10 +242,10 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : - $(info filter value of NOFORTRAN is:) - $(info x$(filter-out $(NOFORTRAN), 1 2)x) + $(info filter value of x$(NOFORTRAN)x is:) + $(info x$(filter 0,$(NOFORTRAN))x) -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -288,21 +288,21 @@ endif endif large.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From cc92257ea6f53fd1e315af08f5981686212a4781 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Jun 2018 00:09:21 +0200 Subject: [PATCH 125/935] Update Makefile --- Makefile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Makefile b/Makefile index 9b9a1f795..b947c1198 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif +ifeq ($(NO_FORTRAN), 1) +define NOFORTRAN +1 +endef +define NO_LAPACK +1 +endef +export NOFORTRAN +export NO_LAPACK +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench From f0a8dc2eec86a20a1486034a999c36709e699266 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 11:34:48 +0200 Subject: [PATCH 126/935] Disable the AVX512 DGEMM kernel for now due to #1643 --- kernel/x86_64/KERNEL.SKYLAKEX | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index c273ff8cd..2deb41b08 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -4,16 +4,16 @@ SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S DTRMMKERNEL = ../generic/trmmkernel_16x2.c -DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S +#DGEMMINCOPY 
= ../generic/gemm_ncopy_16.c +#DGEMMITCOPY = ../generic/gemm_tcopy_16.c +#DGEMMONCOPY = ../generic/gemm_ncopy_2.c +#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMM_BETA = ../generic/gemm_beta.c -DGEMM_BETA = ../generic/gemm_beta.c \ No newline at end of file +DGEMM_BETA = ../generic/gemm_beta.c From 6e54b0a027437303e425382c7e5611c1e860632f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 17:31:06 +0200 Subject: [PATCH 127/935] Disable the 16x2 DTRMM kernel on SkylakeX as well --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 2deb41b08..1256f4c3c 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -3,7 +3,7 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S -DTRMMKERNEL = ../generic/trmmkernel_16x2.c +#DTRMMKERNEL = ../generic/trmmkernel_16x2.c #DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S #DGEMMINCOPY = ../generic/gemm_ncopy_16.c #DGEMMITCOPY = ../generic/gemm_tcopy_16.c From f5243e8e1fc585147e8b6e1553232f5f868eff1d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 23:47:44 +0200 Subject: [PATCH 128/935] Add compiler option to avx512 test and hide test output --- c_check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_check b/c_check index cc64c16c6..3831d7aa3 100644 --- a/c_check +++ b/c_check @@ -205,8 +205,8 @@ $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; print $tmpf "int main(void){ __asm__ volatile($code); }\n"; - $args = " -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args"); + $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? 
!= 0) { $no_avx512 = 1; From 4e9c34018e06615ea2c0c64551691e297682e7a3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 23:57:50 +0200 Subject: [PATCH 129/935] Fix apparent off-by-one error in calculation of MAX_ALLOCATING_THREADS fixes #1641 --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 7eff16ce3..98bcfb216 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -497,7 +497,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); #if defined(SMP) && !defined(USE_OPENMP) /* This is the number of threads than can be spawned by the server, which is the server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 static int next_memory_table_pos = 0; # if defined(HAS_COMPILER_TLS) /* Use compiler generated thread-local-storage */ From 7a914347c56855933cf14a8f50182d95fa619cb4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Jul 2018 11:58:57 +0200 Subject: [PATCH 130/935] remove dev suffix from version number --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 649aabe70..e0f48397f 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.1.dev +VERSION = 0.3.1 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From e6d77111990662ae95cdbe5d8e3e203143deb996 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Jul 2018 11:59:47 +0200 Subject: [PATCH 131/935] remove dev suffix from version number --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a2421ac54..ae95734cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 1.dev) +set(OpenBLAS_PATCH_VERSION 1) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 1392eba488b70c8fb7156ef506037adb1979faf3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Jul 2018 12:01:16 +0200 Subject: [PATCH 132/935] set version number to 0.3.2.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a2421ac54..1bc570961 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 1.dev) +set(OpenBLAS_PATCH_VERSION 2.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From bbf212497062827e4e6d98025f22c2fc47afd918 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Jul 2018 12:01:51 +0200 Subject: [PATCH 133/935] set version number to 0.3.2.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 649aabe70..a3f3b23b9 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version 
-VERSION = 0.3.1.dev +VERSION = 0.3.2.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 9d15a3bd16d5548701474d6ecf618b669a4ff394 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Jul 2018 14:40:41 +0200 Subject: [PATCH 134/935] Fix typo that broke compilation with DYNAMIC_ARCH and NO_AVX2 fixes 1659 --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 13794207c..d727f1045 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){ #ifndef NO_AVX2 return &gotoblas_HASWELL; #else - return &gotblas_SANDYBRIDGE; + return &gotoblas_SANDYBRIDGE; #endif else return &gotoblas_NEHALEM; From b74aef28165c058cc11c74bc5a7b00ddfce15b31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 14:41:44 +0200 Subject: [PATCH 135/935] Add -march=skylake-avx512 to AVX512 compile check and suppress its output --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index a565fc0d5..d339a755f 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -68,7 +68,7 @@ endif() if (X86_64 OR X86) file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") -execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) +execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() From a49203b48c4a3d6f86413fc8c4b1fbfaa1946463 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 17:35:54 +0200 Subject: [PATCH 136/935] Double MAX_ALLOCATING_THREADS to fix segfaults with Go and Octave for #1641 --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb216..a8b76a85a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -497,7 +497,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); #if defined(SMP) && !defined(USE_OPENMP) /* This is the number of threads than can be spawned by the server, which is the server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2 static int next_memory_table_pos = 0; # if defined(HAS_COMPILER_TLS) /* Use compiler generated thread-local-storage */ From 3f73e8b8cfcfb9c5fb40b75dd5e4435487db0655 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 21:01:35 +0200 Subject: [PATCH 137/935] Add cpuid for AMD Ryzen 2 for #1664 --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 89eb809b0..512ad877b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1452,6 +1452,8 @@ int get_cpuname(void){ switch (model) { case 1: // AMD Ryzen + case 8: + // AMD Ryzen2 if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; From d0ec4325cf2b5bf5b9a11c3f173f7ef2dd10d79e Mon Sep 17 00:00:00 2001 From: Martin Kroeker 
Date: Tue, 3 Jul 2018 21:03:24 +0200 Subject: [PATCH 138/935] Add cpuid for AMD Ryzen 2 --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 13794207c..f72902411 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){ } } } else if (exfamily == 8) { - if (model == 1) { + if (model == 1 || model == 8) { if(support_avx()) return &gotoblas_ZEN; else{ From 5f2a3c05cd0e3872be3c5686b9da6b627658eeb7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Jul 2018 21:42:28 +0200 Subject: [PATCH 139/935] Revert "Rewrite &= -> = and simplify the initial blocking phase." --- driver/level3/level3_thread.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ee3e3b9a9..aeb5e6ed4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { + /* Make sure if no one is using workspace */ + START_RPCC(); + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + STOP_RPCC(waiting1); + #if defined(FUSED_GEMM) && !defined(TIMING) /* Fused operation to copy region of B into workspace and apply kernel */ @@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { - /* Make sure if no one is using workspace */ - START_RPCC(); - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; - STOP_RPCC(waiting1); - /* Set flag so other threads can access local region of B */ + /* Set flag so other threads can access local region of B */ + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; - } + WMB; } /* Get regions of B from other threads and apply kernel */ @@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } } while (current != mypos); - /* Iterate through steps of m + /* Iterate through steps of m * Note: First step has already been finished */ for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; @@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); - + #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif - + /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } From 045fb5ea2c5b3e64e0ed747d4227ee8f1063ca05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jul 2018 07:30:58 +0200 Subject: [PATCH 
140/935] Define snprintf for older versions of MSVC for #1677

---
 driver/others/openblas_get_config.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c
index 87a27712f..ecafa16c4 100644
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include 

+#if defined(_WIN32) && defined(_MSC_VER)
+#if _MSC_VER < 1900
+#define snprintf _snprintf_s
+#endif
+#endif
+
 static char* openblas_config_str=""
 #ifdef USE64BITINT
   "USE64BITINT "

From 571e9de2ac77d838ba47bb7ec6981c7a5b5e68d0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 12 Jul 2018 11:42:25 +0200
Subject: [PATCH 141/935] Fix definition of snprintf for MSVC

MS _snprintf_s takes an additional argument for the size of the buffer,
so is not a direct replacement (utest/ctest.h from which I copied was wrong)
---
 driver/others/openblas_get_config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c
index ecafa16c4..3e87f2cc2 100644
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #if defined(_WIN32) && defined(_MSC_VER)
 #if _MSC_VER < 1900
-#define snprintf _snprintf_s
+#define snprintf _snprintf
 #endif
 #endif

From 1309711e243ee945908b0c6139e9ea35c12e97f1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 12 Jul 2018 11:47:52 +0200
Subject: [PATCH 142/935] Fix declaration of snprintf for older MSVC

_snprintf_s takes an additional (size) argument, so is not a direct
replacement. (Note that this code is currently unused - the two
instances of snprintf here are within ifdef blocks that are not compiled
for MSVC)
---
 utest/ctest.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utest/ctest.h b/utest/ctest.h
index 1deea32f6..f297dafba 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -84,7 +84,7 @@ struct ctest {
 #endif

 #if _MSC_VER < 1900
-#define snprintf _snprintf_s
+#define snprintf _snprintf
 #endif

 #ifndef __cplusplus

From 8d5b33b6be7877d5df3f120d800f25cf900ee4c0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 12 Jul 2018 23:39:00 +0200
Subject: [PATCH 143/935] Add cpu identification via mfpvr call for the BSDs

fixes #1671
---
 cpuid_power.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/cpuid_power.c b/cpuid_power.c
index 951204ae9..6c7baef4a 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -142,6 +142,52 @@ int detect(void){
   return CPUTYPE_PPC970;
 #endif

+
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
+int id;
+__asm __volatile("mfpvr %0" : "=r"(id));
+switch ( id >> 16 ) {
+  case 0x4e: // POWER9
+    return CPUTYPE_POWER8;
+    break;
+  case 0x4d:
+  case 0x4b: // POWER8/8E
+    return CPUTYPE_POWER8;
+    break;
+  case 0x4a:
+  case 0x3f: // POWER7/7E
+    return CPUTYPE_POWER6;
+    break;
+  case 0x3e:
+    return CPUTYPE_POWER6;
+    break;
+  case 0x3a:
+    return CPUTYPE_POWER5;
+    break;
+  case 0x35:
+  case 0x38: // POWER4 /4+
+    return CPUTYPE_POWER4;
+    break;
+  case 0x40:
+  case 0x41: // POWER3 /3+
+    return CPUTYPE_POWER3;
+    break;
+  case 0x39:
+  case 0x3c:
+  case 0x44:
+  case 0x45:
+    return CPUTYPE_PPC970;
+    break;
+  case 0x70:
+    return CPUTYPE_CELL;
+    break;
+  case 0x8003:
+    return CPUTYPE_PPCG4;
+    break;
+  default:
+    return
CPUTYPE_UNKNOWN; + } +#endif } void get_architecture(void){ From 2fbfc64da8d4850bd2d1ba76c873b4b79acbac3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jul 2018 17:09:55 +0200 Subject: [PATCH 144/935] Use C kernels for default c/zAXPY, xROT, c/zSWAP --- kernel/mips64/KERNEL | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 6afb2cf13..57251d3df 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -1,3 +1,12 @@ +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zwap.c + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif From d2142760e0a50a7b268fc64e7c4657449b1e7c0b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jul 2018 17:11:40 +0200 Subject: [PATCH 145/935] Fix precision problem in DSDOT --- kernel/mips64/dot.S | 169 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 159 insertions(+), 10 deletions(-) diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S index cb6fbe99c..a645495f4 100644 --- a/kernel/mips64/dot.S +++ b/kernel/mips64/dot.S @@ -103,35 +103,83 @@ .align 3 .L12: +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 4 * SIZE(X) LD b1, 4 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 5 * SIZE(X) LD b2, 5 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif LD a3, 6 * SIZE(X) LD b3, 6 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 7 * SIZE(X) LD b4, 7 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 8 * SIZE(X) LD b1, 8 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 9 * SIZE(X) LD b2, 9 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif LD a3, 10 * SIZE(X) LD b3, 10 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 11 * SIZE(X) LD b4, 11 * SIZE(Y) @@ -143,29 +191,77 @@ .align 3 .L13: +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif LD a1, 4 * SIZE(X) LD b1, 4 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif LD a2, 5 * SIZE(X) LD b2, 5 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif LD a3, 6 * SIZE(X) LD b3, 6 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif LD a4, 7 * SIZE(X) LD b4, 7 * SIZE(Y) +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif daddiu X, X, 8 * SIZE +#ifdef DSDOT +cvt.d.s a2, a2 +cvt.d.s b2, b2 +madd.d s2, s2, a2, b2 +#else MADD s2, s2, a2, b2 +#endif daddiu Y, Y, 8 * SIZE +#ifdef DSDOT +cvt.d.s a3, a3 +cvt.d.s b3, b3 +madd.d s1, s1, a3, b3 +#else MADD s1, s1, a3, b3 +#endif +#ifdef DSDOT +cvt.d.s a4, a4 +cvt.d.s b4, b4 +madd.d s2, s2, a4, b4 +#else MADD s2, s2, a4, b4 +#endif .align 3 .L15: @@ -179,8 +275,13 @@ LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) 
+#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif daddiu I, I, -1 daddiu X, X, SIZE @@ -225,50 +326,85 @@ LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 - +#endif LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) @@ -277,7 +413,13 @@ daddiu I, I, -1 bgtz I, .L23 +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s2, s2, a1, b1 +#else MADD s2, s2, a1, b1 +#endif .align 3 .L25: @@ -296,13 +438,20 @@ daddiu I, I, -1 bgtz I, .L26 +#ifdef DSDOT +cvt.d.s a1, a1 +cvt.d.s b1, b1 +madd.d s1, s1, a1, b1 +#else MADD s1, s1, a1, b1 +#endif .align 3 .L999: - ADD s1, s1, s2 #ifdef DSDOT - cvt.d.s s1, s1 + add.d s1, s1, s2 +#else + ADD s1, s1, s2 #endif j $31 NOP From 4e103c822cfd30c8de17ed86b0a1b0c314e6936b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 16 Jul 2018 12:56:39 +0200 Subject: [PATCH 146/935] typo fix --- kernel/mips64/KERNEL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 57251d3df..e257dcfc9 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -5,7 +5,7 @@ DROTKERNEL = ../mips/rot.c CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c -ZSWAPKERNEL = ../mips/zwap.c +ZSWAPKERNEL = ../mips/zswap.c ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S From b14f44d2adbe1ec8ede0cdf06fb8b09f3c4b6e43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Jul 2018 08:57:56 +0200 Subject: [PATCH 147/935] Temporarily disable special handling of OPENMP thread memory allocation for issue #1673 --- driver/others/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb216..772c1f232 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif

 #ifndef BUFFERS_PER_THREAD
-#ifdef USE_OPENMP
+#ifdef USE_OPENMP_UNUSED
 #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
 #else
 #define BUFFERS_PER_THREAD NUM_BUFFERS
@@ -363,7 +363,7 @@ int blas_get_cpu_number(void){
 #endif

 //  blas_goto_num = 0;
-#ifndef USE_OPENMP
+#ifndef USE_OPENMP_UNUSED
   blas_goto_num=openblas_num_threads_env();
   if (blas_goto_num < 0) blas_goto_num = 0;

@@ -494,7 +494,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
 #endif

 /* Holds pointers to allocated memory */
-#if defined(SMP) && !defined(USE_OPENMP)
+#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
 /* This is the number of threads than can be spawned by the server, which is the
    server plus the number of threads in the thread pool */
 #  define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1
@@ -532,7 +532,7 @@ static BLASULONG  alloc_lock = 0UL;

 /* Returns a pointer to the start of the per-thread memory allocation data */
 static __inline struct alloc_t ** get_memory_table() {
-#if defined(SMP) && !defined(USE_OPENMP)
+#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
 #  if !defined(HAS_COMPILER_TLS)
 #    if defined(OS_WINDOWS)
   int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
@@ -1057,7 +1057,7 @@ static volatile int memory_initialized = 0;
 /*  2 : Thread  */

 static void blas_memory_init(){
-#if defined(SMP) && !defined(USE_OPENMP)
+#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
   next_memory_table_pos = 0;
 #  if !defined(HAS_COMPILER_TLS)
 #    if defined(OS_WINDOWS)

From 43ac839c168c652e52320267b0504e6933cb9f60 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 22 Jul 2018 09:19:19 +0200
Subject: [PATCH 148/935] Unset memory table entry, not just the temporary
 pointer to it, on shutdown to fix crash with multiple instances of OpenBLAS,
 #1692

---
 driver/others/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 98bcfb216..3bf6ba019 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -1279,7 +1279,7 @@ void blas_shutdown(void){
       struct alloc_t *alloc_info = local_memory_table[thread][pos];
       if (alloc_info) {
         alloc_info->release_func(alloc_info);
-        alloc_info = (void *)0;
+        local_memory_table[thread][pos] = (void *)0;
       }
     }
   }

From 73131fa30ac40029b51f49356cd0f1349a815e79 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 24 Jul 2018 17:46:33 +0200
Subject: [PATCH 149/935] Do not treat Windows UWP builds as cross-compiling

---
 cmake/prebuild.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index b783ef90d..f29bc3a75 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
 endif ()

 # Cannot run getarch on target if we are cross-compiling
-if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
+if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
   # Write to config as getarch would
   # TODO: Set up defines that getarch sets up based on every other target

From 2cc8fb0ad2828ca52bd06609fa461c71da66640b Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 30 Jul 2018 08:22:38 +0200
Subject: [PATCH 150/935] Set version to 0.3.3.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b8602da96..97f8adeda 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 2) +set(OpenBLAS_PATCH_VERSION 3.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From b03ae3f4dc90a3bde83b98f3fd67e8c618c0390b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Jul 2018 08:23:13 +0200 Subject: [PATCH 151/935] Set version to 0.3.3.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index c205c0c1c..2912bab94 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.2 +VERSION = 0.3.3.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 8ebf541e97e7c0573cde6a51353a47e3c509ca00 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Mon, 30 Jul 2018 15:18:29 -0500 Subject: [PATCH 152/935] Set EXPORT_NAME to match OpenBLASConfig.cmake --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97f8adeda..4dc18da99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,6 +169,7 @@ endif() # Set output for libopenblas set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") +set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) From 6400868e553f03110890882c78f9f38ee69e4615 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 2 Aug 2018 16:21:19 +0100 Subject: [PATCH 153/935] Fix #1705 where we incorrectly calculate page locations. Since we now use an allocation size that isn't a multiple of PAGESIZE, finding the pages for run_bench wasn't terminating properly. Now we detect if we've found enough pages for the allocation and terminate the loop. 
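
To make the failure mode concrete, here is a minimal standalone sketch
(illustration only, not OpenBLAS code: the touch_pages name and the
PAGESIZE value are invented for this example) of why an unsigned
countdown fails to terminate once the size stops being a multiple of
the page size, and how remembering the original value catches the
wraparound:

#include <stdio.h>

#define PAGESIZE 4096UL

static void touch_pages(unsigned long size) {
  unsigned long current  = size;
  unsigned long original = current;
  unsigned long pages    = 0;
  /* "current <= original" detects the unsigned wraparound that occurs
     when size is not a multiple of PAGESIZE */
  while (current > 0 && current <= original) {
    pages++;
    current -= PAGESIZE;
  }
  printf("%lu bytes -> touched %lu pages\n", size, pages);
}

int main(void) {
  touch_pages(3 * PAGESIZE);     /* exact multiple: stops at exactly 0 */
  touch_pages(3 * PAGESIZE + 1); /* counter reaches 1, then wraps; the
                                    extra condition exits after 4 pages */
  return 0;
}

With only "current > 0" as the condition, the second call would keep
subtracting for nearly 2^64 / PAGESIZE iterations, which is the
non-termination described above.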
--- driver/others/memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb216..044c7d7d8 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -637,7 +637,7 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { static void *alloc_mmap(void *address){ void *map_address, *best_address; - BLASULONG best, start, current; + BLASULONG best, start, current, original; BLASULONG allocsize; if (address){ @@ -685,8 +685,9 @@ static void *alloc_mmap(void *address){ start = (BLASULONG)map_address; current = (SCALING - 1) * allocation_block_size; + original = current; - while(current > 0) { + while(current > 0 && current <= original) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; From 6463bffd593d0b5346482dd3a35b7558fc056868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Mizsei?= Date: Thu, 2 Aug 2018 20:49:14 +0200 Subject: [PATCH 154/935] Haiku supporting patches --- Makefile | 4 +++- Makefile.install | 2 +- c_check | 1 + common.h | 4 ++++ ctest.c | 4 ++++ driver/others/blas_server.c | 2 +- driver/others/memory.c | 12 ++++++++++++ exports/Makefile | 2 +- 8 files changed, 27 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index b947c1198..d99521b19 100644 --- a/Makefile +++ b/Makefile @@ -97,7 +97,7 @@ endif shared : ifndef NO_SHARED -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) @@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN) ifdef SMP ifeq ($(OSNAME), WINNT) -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +else ifeq ($(OSNAME), Haiku) + -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc else -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc endif diff --git a/Makefile.install b/Makefile.install index c51c8a021..fa657beba 100644 --- a/Makefile.install +++ b/Makefile.install @@ -66,7 +66,7 @@ endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ diff --git a/c_check b/c_check index 3831d7aa3..8f6296d6c 100644 --- a/c_check +++ b/c_check @@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); +$os = Haiku if ($data =~ /OS_HAIKU/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); diff --git a/common.h b/common.h index 663f37e7b..d87b234ae 100644 --- a/common.h +++ b/common.h @@ -105,6 +105,10 @@ extern "C" { #endif #endif +#ifdef OS_HAIKU +#define NO_SYSV_IPC +#endif + #ifdef OS_WINDOWS #ifdef ATOM #define GOTO_ATOM ATOM diff --git a/ctest.c b/ctest.c index 00be423d1..0571e9e02 100644 --- a/ctest.c +++ b/ctest.c @@ -101,6 +101,10 @@ OS_INTERIX OS_LINUX #endif +#if defined(__HAIKU__) +OS_HAIKU +#endif + #if defined(__i386) || defined(_X86) ARCH_X86 #endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 794dfb20e..1d7f570d8 100644 --- a/driver/others/blas_server.c 
+++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU) #include #include #include diff --git a/driver/others/memory.c b/driver/others/memory.c index a27d9001f..c4bd9b73c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -108,6 +108,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#ifdef OS_HAIKU +#include +#endif + #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include @@ -238,6 +242,14 @@ int get_num_procs(void) { } #endif +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + #ifdef OS_WINDOWS int get_num_procs(void) { diff --git a/exports/Makefile b/exports/Makefile index 127b05057..29075a9c2 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -122,7 +122,7 @@ endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) so : ../$(LIBSONAME) From 2a9a9389ef6c3da56fda859c9586becc41a2e780 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 2 Aug 2018 14:58:52 -0500 Subject: [PATCH 155/935] Added target_include_directories() --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4dc18da99..20ce02e87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,6 +150,7 @@ endif() # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) +target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) # Android needs to explicitly link against libm if(ANDROID) From a4e321400b534d7f547cad8127ff6da45e1ac872 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Fri, 3 Aug 2018 13:00:10 -0400 Subject: [PATCH 156/935] fabs -> fabsl Fixes two calls that were using `fabs` on a `long double` argument rather than `fabsl`, which looks like it is doing an unintentional truncation to `double` precision. 
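
As a quick illustration (a standalone sketch, not part of the patch; it
assumes a platform such as x86-64 Linux where long double is wider than
double; where the two types are identical, e.g. MSVC, both calls behave
the same):

#include <float.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  /* a value whose low mantissa bits exist only in long double */
  long double x = -(1.0L + LDBL_EPSILON);

  long double a = fabs(x);  /* argument silently converted to double */
  long double b = fabsl(x); /* absolute value at full precision */

  printf("fabs  kept the low bits: %s\n", a == -x ? "yes" : "no");
  printf("fabsl kept the low bits: %s\n", b == -x ? "yes" : "no");
  return 0;
}

On x86-64 Linux this prints "no" for fabs and "yes" for fabsl: the
implicit long double to double conversion rounds 1 + LDBL_EPSILON to
exactly 1 before the absolute value is even taken.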
--- interface/rotg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/rotg.c b/interface/rotg.c index 092554299..69443a5a0 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ long double s; long double r, roe, z; - long double ada = fabs(da); - long double adb = fabs(db); + long double ada = fabsl(da); + long double adb = fabsl(db); long double scale = ada + adb; #ifndef CBLAS From 933896a1d0b284e28f742e7b73e8129b80dba43b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Aug 2018 20:06:49 +0200 Subject: [PATCH 157/935] Use blasabs to switch between abs and labs as needed for INTERFACE64 --- interface/gbmv.c | 2 +- interface/gemv.c | 2 +- interface/sbmv.c | 2 +- interface/spmv.c | 2 +- interface/symv.c | 2 +- interface/zgbmv.c | 2 +- interface/zgemv.c | 2 +- interface/zhbmv.c | 2 +- interface/zhemv.c | 2 +- interface/zhpmv.c | 2 +- interface/zsbmv.c | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/interface/gbmv.c b/interface/gbmv.c index 096c9f6f2..1d58ba807 100644 --- a/interface/gbmv.c +++ b/interface/gbmv.c @@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans) lenx = m; if (trans) leny = n; - if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/gemv.c b/interface/gemv.c index 30709e361..c9d52cd69 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans) lenx = m; if (trans) leny = n; - if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/sbmv.c b/interface/sbmv.c index 761a9a0d0..25e99ca34 100644 --- a/interface/sbmv.c +++ b/interface/sbmv.c @@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/spmv.c b/interface/spmv.c index 403458b06..e08ae3f6e 100644 --- a/interface/spmv.c +++ b/interface/spmv.c @@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/symv.c b/interface/symv.c index e4e300e20..07bd20022 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (n == 0) return; - if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; diff --git a/interface/zgbmv.c b/interface/zgbmv.c index a04be2fbf..5e275a8ed 100644 --- a/interface/zgbmv.c +++ b/interface/zgbmv.c @@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans & 1) lenx = m; if (trans & 1) leny = n; - if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; 
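
NOTE (illustration, not part of the diff): the truncation this series
guards against is easy to reproduce. abs() takes an int, so under
INTERFACE64 a 64-bit increment loses its high bits before the absolute
value is taken. A hedged standalone sketch, written with long long and
llabs so it is portable to both LP64 and LLP64 platforms:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  long long inc = -((1LL << 33) + 5);  /* a stride that overflows int */

  /* abs() operates on int: on the usual two's-complement targets the
     cast discards the high bits, leaving -5 */
  printf("abs   -> %d\n",   abs((int)inc));  /* prints 5 */
  printf("llabs -> %lld\n", llabs(inc));     /* prints 8589934597 */
  return 0;
}

The patch routes such calls through blasabs(), which this series maps
to abs() for 32-bit blasint and to labs() (and, for LLP64 Windows,
llabs()) when blasint is 64-bit.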
diff --git a/interface/zgemv.c b/interface/zgemv.c index 0c75564f0..3e98dba7f 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans & 1) lenx = m; if (trans & 1) leny = n; - if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; diff --git a/interface/zhbmv.c b/interface/zhbmv.c index 9ad1b53a1..656f137c6 100644 --- a/interface/zhbmv.c +++ b/interface/zhbmv.c @@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/zhemv.c b/interface/zhemv.c index 2aee880dc..d1996ad69 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/zhpmv.c b/interface/zhpmv.c index b72a6d670..ff49716b5 100644 --- a/interface/zhpmv.c +++ b/interface/zhpmv.c @@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order, if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/zsbmv.c b/interface/zsbmv.c index b71d4c519..cd5cefed9 100644 --- a/interface/zsbmv.c +++ b/interface/zsbmv.c @@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * if (n == 0) return; - if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; From 40c068a8750d74d2434709aac3992a8bf80e4734 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Aug 2018 20:07:59 +0200 Subject: [PATCH 158/935] Introduce blasabs() to switch between abs() and labs() for INTERFACE64 --- common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common.h b/common.h index 663f37e7b..0516a57c0 100644 --- a/common.h +++ b/common.h @@ -253,8 +253,10 @@ typedef unsigned long BLASULONG; #ifdef USE64BITINT typedef BLASLONG blasint; +#define blasabs(x) labs(x) #else typedef int blasint; +#define blasabs(x) abs(x) #endif #else #ifdef USE64BITINT From 165f00c159cf0c4e7e6eef8f656fa68e7cda4ea2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Aug 2018 20:14:51 +0200 Subject: [PATCH 159/935] fabs -> fabsl --- interface/zrotg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/zrotg.c b/interface/zrotg.c index 187343d41..8caa411fc 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ long 
double db_i = *(DB + 1); long double r; - long double ada = fabs(da_r) + fabs(da_i); + long double ada = fabsl(da_r) + fabsl(da_i); PRINT_DEBUG_NAME; From 48610a4524937c8feb857aa0f49f5999edfdd42c Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sun, 5 Aug 2018 08:18:51 -0400 Subject: [PATCH 160/935] fix blasabs for windows Bugfix in #1713 for Windows (LLP64), where `blasabs` needs to be `llabs` rather than `labs` for the 64-bit API. --- common.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common.h b/common.h index 2ab29c3ee..6c3d5b15e 100644 --- a/common.h +++ b/common.h @@ -257,7 +257,11 @@ typedef unsigned long BLASULONG; #ifdef USE64BITINT typedef BLASLONG blasint; +#if defined(OS_WINDOWS) && defined(__64BIT__) +#define blasabs(x) llabs(x) +#else #define blasabs(x) labs(x) +#endif #else typedef int blasint; #define blasabs(x) abs(x) From 73478664d4fb01f93d1810e85e1b7a499288b5bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Aug 2018 16:40:32 +0200 Subject: [PATCH 161/935] Add workaround for avx512 compilations on Cygwin fixes #1708 --- Makefile.x86_64 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 677c05d93..f831b5040 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX) ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +endif endif endif From 23229011db2ab03c7643f1e0a007efc8e0276201 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 6 Aug 2018 18:20:40 +0300 Subject: [PATCH 162/935] [ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization --- Makefile.zarch | 4 + cpuid_zarch.c | 35 +- kernel/zarch/KERNEL.Z13 | 20 +- kernel/zarch/KERNEL.Z14 | 146 +++++++ kernel/zarch/camax.c | 269 +++++++++++++ kernel/zarch/camin.c | 269 +++++++++++++ kernel/zarch/casum.c | 167 ++++++++ kernel/zarch/caxpy.c | 174 +++++++++ kernel/zarch/ccopy.c | 99 +++++ kernel/zarch/cdot.c | 182 +++++++++ kernel/zarch/crot.c | 256 ++++++++++++ kernel/zarch/cscal.c | 456 +++++++++++++++++++++ kernel/zarch/cswap.c | 183 +++++++++ kernel/zarch/damax.c | 206 ++++++++++ kernel/zarch/damin.c | 206 ++++++++++ kernel/zarch/dasum.c | 158 ++++---- kernel/zarch/daxpy.c | 177 ++++----- kernel/zarch/dcopy.c | 122 +----- kernel/zarch/ddot.c | 155 +++----- kernel/zarch/dgemv_n_4.c | 516 ++++++++++++++++-------- kernel/zarch/dgemv_t_4.c | 578 ++++++++++++++++++++------- kernel/zarch/dmax.c | 182 +++++++++ kernel/zarch/dmin.c | 182 +++++++++ kernel/zarch/drot.c | 338 ++++++++-------- kernel/zarch/dscal.c | 200 ++++------ kernel/zarch/dsdot.c | 180 +++++++++ kernel/zarch/dswap.c | 292 ++++---------- kernel/zarch/icamax.c | 319 +++++++++++++++ kernel/zarch/icamin.c | 319 +++++++++++++++ kernel/zarch/idamax.c | 295 +++++++------- kernel/zarch/idamin.c | 325 ++++++++------- kernel/zarch/idmax.c | 232 +++++++++++ kernel/zarch/idmin.c | 232 +++++++++++ kernel/zarch/isamax.c | 299 ++++++++++++++ kernel/zarch/isamin.c | 299 ++++++++++++++ kernel/zarch/ismax.c | 275 +++++++++++++ kernel/zarch/ismin.c | 275 +++++++++++++ kernel/zarch/izamax.c | 334 ++++++++-------- kernel/zarch/izamin.c | 400 +++++++++---------- kernel/zarch/samax.c | 210 ++++++++++ kernel/zarch/samin.c | 210 ++++++++++ kernel/zarch/sasum.c | 174 +++++++++ kernel/zarch/saxpy.c | 184 +++++++++ kernel/zarch/scopy.c | 85 ++++ kernel/zarch/sdot.c | 140 +++++++ 
kernel/zarch/sgemv_n_4.c | 668 +++++++++++++++++++++++++++++++ kernel/zarch/sgemv_t_4.c | 826 +++++++++++++++++++++++++++++++++++++++ kernel/zarch/smax.c | 186 +++++++++ kernel/zarch/smin.c | 186 +++++++++ kernel/zarch/srot.c | 246 ++++++++++++ kernel/zarch/sscal.c | 201 ++++++++++ kernel/zarch/sswap.c | 164 ++++++++ kernel/zarch/zamax.c | 221 +++++++++++ kernel/zarch/zamin.c | 221 +++++++++++ kernel/zarch/zasum.c | 152 +++---- kernel/zarch/zaxpy.c | 216 +++++----- kernel/zarch/zcopy.c | 86 +--- kernel/zarch/zdot.c | 213 ++++------ kernel/zarch/zrot.c | 339 ++++++++-------- kernel/zarch/zscal.c | 460 ++++++++++------------ kernel/zarch/zswap.c | 291 ++++---------- ztest/Makefile | 437 +++++++++++++++++++++ ztest/amax.c | 235 +++++++++++ ztest/amin.c | 235 +++++++++++ ztest/asum.c | 263 +++++++++++++ ztest/axpy.c | 303 ++++++++++++++ ztest/copy.c | 291 ++++++++++++++ ztest/dot.c | 296 ++++++++++++++ ztest/dsdot.c | 229 +++++++++++ ztest/gemv.c | 618 +++++++++++++++++++++++++++++ ztest/iamax.c | 284 ++++++++++++++ ztest/iamin.c | 284 ++++++++++++++ ztest/imax.c | 231 +++++++++++ ztest/imin.c | 231 +++++++++++ ztest/max.c | 229 +++++++++++ ztest/min.c | 229 +++++++++++ ztest/rot.c | 303 ++++++++++++++ ztest/scal.c | 308 +++++++++++++++ ztest/swap.c | 306 +++++++++++++++ 79 files changed, 17382 insertions(+), 2965 deletions(-) create mode 100644 kernel/zarch/KERNEL.Z14 create mode 100644 kernel/zarch/camax.c create mode 100644 kernel/zarch/camin.c create mode 100644 kernel/zarch/casum.c create mode 100644 kernel/zarch/caxpy.c create mode 100644 kernel/zarch/ccopy.c create mode 100644 kernel/zarch/cdot.c create mode 100644 kernel/zarch/crot.c create mode 100644 kernel/zarch/cscal.c create mode 100644 kernel/zarch/cswap.c create mode 100644 kernel/zarch/damax.c create mode 100644 kernel/zarch/damin.c create mode 100644 kernel/zarch/dmax.c create mode 100644 kernel/zarch/dmin.c create mode 100644 kernel/zarch/dsdot.c create mode 100644 kernel/zarch/icamax.c create mode 100644 kernel/zarch/icamin.c create mode 100644 kernel/zarch/idmax.c create mode 100644 kernel/zarch/idmin.c create mode 100644 kernel/zarch/isamax.c create mode 100644 kernel/zarch/isamin.c create mode 100644 kernel/zarch/ismax.c create mode 100644 kernel/zarch/ismin.c create mode 100644 kernel/zarch/samax.c create mode 100644 kernel/zarch/samin.c create mode 100644 kernel/zarch/sasum.c create mode 100644 kernel/zarch/saxpy.c create mode 100644 kernel/zarch/scopy.c create mode 100644 kernel/zarch/sdot.c create mode 100644 kernel/zarch/sgemv_n_4.c create mode 100644 kernel/zarch/sgemv_t_4.c create mode 100644 kernel/zarch/smax.c create mode 100644 kernel/zarch/smin.c create mode 100644 kernel/zarch/srot.c create mode 100644 kernel/zarch/sscal.c create mode 100644 kernel/zarch/sswap.c create mode 100644 kernel/zarch/zamax.c create mode 100644 kernel/zarch/zamin.c create mode 100644 ztest/Makefile create mode 100644 ztest/amax.c create mode 100644 ztest/amin.c create mode 100644 ztest/asum.c create mode 100644 ztest/axpy.c create mode 100644 ztest/copy.c create mode 100644 ztest/dot.c create mode 100644 ztest/dsdot.c create mode 100644 ztest/gemv.c create mode 100644 ztest/iamax.c create mode 100644 ztest/iamin.c create mode 100644 ztest/imax.c create mode 100644 ztest/imin.c create mode 100644 ztest/max.c create mode 100644 ztest/min.c create mode 100644 ztest/rot.c create mode 100644 ztest/scal.c create mode 100644 ztest/swap.c diff --git a/Makefile.zarch b/Makefile.zarch index 9ec9dc79f..47ea1eb71 100644 --- a/Makefile.zarch +++ 
b/Makefile.zarch @@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector endif +ifeq ($(CORE), Z14) +CCOMMON_OPT += -march=z14 -mzvector +FCOMMON_OPT += -march=z14 -mzvector +endif diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e1935429..0ae32f27d 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,40 +29,25 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) { - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - - return CPU_GENERIC; + // return CPU_GENERIC; + return CPU_Z14; + } void get_libname(void) @@ -107,5 +92,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index add628bfe..d39b9d904 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = damax.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = zamax.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c +DAMINKERNEL = damin.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = zamin.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c +DMAXKERNEL = dmax.c SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c +DMINKERNEL = dmin.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c @@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = izamin.c ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = idmax.c ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c +IDMINKERNEL = idmin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = dasum.c @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c +ZGEMVTKERNEL = ../arm/zgemv_t.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 new file mode 100644 index 000000000..fa88b6881 --- /dev/null +++ b/kernel/zarch/KERNEL.Z14 @@ -0,0 +1,146 @@ +SAMAXKERNEL = samax.c +DAMAXKERNEL = damax.c +CAMAXKERNEL = camax.c +ZAMAXKERNEL = zamax.c + +SAMINKERNEL = samin.c +DAMINKERNEL = damin.c +CAMINKERNEL = camin.c +ZAMINKERNEL = zamin.c + +SMAXKERNEL = smax.c +DMAXKERNEL = dmax.c + +SMINKERNEL = smin.c +DMINKERNEL = dmin.c + +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c + +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c + +ISMAXKERNEL = ismax.c +IDMAXKERNEL = idmax.c + +ISMINKERNEL = ismin.c +IDMINKERNEL = idmin.c + +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c 
+CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +DSDOTKERNEL = dsdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c + +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c + +SGEMVNKERNEL = sgemv_n_4.c +DGEMVNKERNEL = dgemv_n_4.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = sgemv_t_4.c +DGEMVTKERNEL = dgemv_t_4.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = strmm8x4V.S +DTRMMKERNEL = trmm8x4V.S +CTRMMKERNEL = ctrmm4x4V.S +ZTRMMKERNEL = ztrmm4x4V.S + +SGEMMKERNEL = strmm8x4V.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + + + +DGEMMKERNEL = gemm8x4V.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ctrmm4x4V.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ztrmm4x4V.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + + diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c new file mode 100644 index 000000000..6394be769 --- /dev/null +++ b/kernel/zarch/camax.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vlef %%v0,0(%2),0 \n\t" + "vlef %%v16,4(%2),0 \n\t" + "vlef %%v0,8(%2),1 \n\t" + "vlef %%v16,12(%2),1 \n\t" + "vlef %%v0,16(%2),2 \n\t" + "vlef %%v16,20(%2),2 \n\t" + "vlef %%v0,24(%2),3 \n\t" + "vlef %%v16,28(%2),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v16,%%v16 \n\t" + "vfasb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%2) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v17,4(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),1 \n\t" + "vlef %%v17,12(%%r1,%2),1 \n\t" + "vlef %%v16,16(%%r1,%2),2 \n\t" + "vlef %%v17,20(%%r1,%2),2 \n\t" + "vlef %%v16,24(%%r1,%2),3 \n\t" + "vlef %%v17,28(%%r1,%2),3 \n\t" + + "vlef %%v18,32(%%r1,%2),0 \n\t" + "vlef %%v19,36(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),1 \n\t" + "vlef %%v19,44(%%r1,%2),1 \n\t" + "vlef %%v18,48(%%r1,%2),2 \n\t" + "vlef %%v19,52(%%r1,%2),2 \n\t" + "vlef %%v18,56(%%r1,%2),3 \n\t" + "vlef %%v19,30(%%r1,%2),3 \n\t" + + "vlef %%v20,64(%%r1,%2),0 \n\t" + "vlef %%v21,68(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),1 \n\t" + "vlef %%v21,76(%%r1,%2),1 \n\t" + "vlef %%v20,80(%%r1,%2),2 \n\t" + "vlef %%v21,84(%%r1,%2),2 \n\t" + "vlef %%v20,88(%%r1,%2),3 \n\t" + "vlef %%v21,92(%%r1,%2),3 \n\t" + + "vlef %%v22,96(%%r1,%2),0 \n\t" + "vlef %%v23,100(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),1 \n\t" + "vlef %%v23,108(%%r1,%2),1 \n\t" + "vlef %%v22,112(%%r1,%2),2 \n\t" + "vlef %%v23,116(%%r1,%2),2 \n\t" + "vlef %%v22,120(%%r1,%2),3 \n\t" + "vlef %%v23,124(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v24,%%v25 \n\t" + "vsel 
%%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vlef %%v16,128(%%r1,%2),0 \n\t" + "vlef %%v17,132(%%r1,%2),0 \n\t" + "vlef %%v16,136(%%r1,%2),1 \n\t" + "vlef %%v17,140(%%r1,%2),1 \n\t" + "vlef %%v16,144(%%r1,%2),2 \n\t" + "vlef %%v17,148(%%r1,%2),2 \n\t" + "vlef %%v16,152(%%r1,%2),3 \n\t" + "vlef %%v17,156(%%r1,%2),3 \n\t" + + "vlef %%v18,160(%%r1,%2),0 \n\t" + "vlef %%v19,164(%%r1,%2),0 \n\t" + "vlef %%v18,168(%%r1,%2),1 \n\t" + "vlef %%v19,172(%%r1,%2),1 \n\t" + "vlef %%v18,176(%%r1,%2),2 \n\t" + "vlef %%v19,180(%%r1,%2),2 \n\t" + "vlef %%v18,184(%%r1,%2),3 \n\t" + "vlef %%v19,188(%%r1,%2),3 \n\t" + + "vlef %%v20,192(%%r1,%2),0 \n\t" + "vlef %%v21,196(%%r1,%2),0 \n\t" + "vlef %%v20,200(%%r1,%2),1 \n\t" + "vlef %%v21,204(%%r1,%2),1 \n\t" + "vlef %%v20,208(%%r1,%2),2 \n\t" + "vlef %%v21,212(%%r1,%2),2 \n\t" + "vlef %%v20,216(%%r1,%2),3 \n\t" + "vlef %%v21,220(%%r1,%2),3 \n\t" + + "vlef %%v22,224(%%r1,%2),0 \n\t" + "vlef %%v23,228(%%r1,%2),0 \n\t" + "vlef %%v22,232(%%r1,%2),1 \n\t" + "vlef %%v23,236(%%r1,%2),1 \n\t" + "vlef %%v22,240(%%r1,%2),2 \n\t" + "vlef %%v23,244(%%r1,%2),2 \n\t" + "vlef %%v22,248(%%r1,%2),3 \n\t" + "vlef %%v23,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = camax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) > maxf) { + maxf = ABS(x[i*2]); + } + i++; + } + return (maxf); + + } else { + + inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) > maxf) { + maxf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) > maxf) { + maxf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) > maxf) { + maxf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c new file mode 100644 index 000000000..936c300c8 --- /dev/null +++ b/kernel/zarch/camin.c @@ -0,0 +1,269 @@ 
+/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vlef %%v0,0(%2),0 \n\t" + "vlef %%v16,4(%2),0 \n\t" + "vlef %%v0,8(%2),0 \n\t" + "vlef %%v16,12(%2),0 \n\t" + "vlef %%v0,16(%2),2 \n\t" + "vlef %%v16,20(%2),2 \n\t" + "vlef %%v0,24(%2),3 \n\t" + "vlef %%v16,28(%2),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v16,%%v16 \n\t" + "vfasb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v17,4(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),0 \n\t" + "vlef %%v17,12(%%r1,%2),0 \n\t" + "vlef %%v16,16(%%r1,%2),2 \n\t" + "vlef %%v17,20(%%r1,%2),2 \n\t" + "vlef %%v16,24(%%r1,%2),3 \n\t" + "vlef %%v17,28(%%r1,%2),3 \n\t" + + "vlef %%v18,32(%%r1,%2),0 \n\t" + "vlef %%v19,36(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),0 \n\t" + "vlef %%v19,44(%%r1,%2),0 \n\t" + "vlef %%v18,48(%%r1,%2),2 \n\t" + "vlef %%v19,52(%%r1,%2),2 \n\t" + "vlef %%v18,56(%%r1,%2),3 \n\t" + "vlef %%v19,30(%%r1,%2),3 \n\t" + + "vlef %%v20,64(%%r1,%2),0 \n\t" + "vlef %%v21,68(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),0 \n\t" + "vlef %%v21,76(%%r1,%2),0 \n\t" + "vlef %%v20,80(%%r1,%2),2 \n\t" + "vlef %%v21,84(%%r1,%2),2 \n\t" + "vlef %%v20,88(%%r1,%2),3 \n\t" + "vlef %%v21,92(%%r1,%2),3 \n\t" + + "vlef %%v22,96(%%r1,%2),0 \n\t" + "vlef %%v23,100(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),0 \n\t" + "vlef %%v23,108(%%r1,%2),0 \n\t" + "vlef %%v22,112(%%r1,%2),2 \n\t" + "vlef %%v23,116(%%r1,%2),2 \n\t" + "vlef %%v22,120(%%r1,%2),3 \n\t" + "vlef %%v23,124(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb 
%%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vlef %%v16,128(%%r1,%2),0 \n\t" + "vlef %%v17,132(%%r1,%2),0 \n\t" + "vlef %%v16,136(%%r1,%2),0 \n\t" + "vlef %%v17,140(%%r1,%2),0 \n\t" + "vlef %%v16,144(%%r1,%2),2 \n\t" + "vlef %%v17,148(%%r1,%2),2 \n\t" + "vlef %%v16,152(%%r1,%2),3 \n\t" + "vlef %%v17,156(%%r1,%2),3 \n\t" + + "vlef %%v18,160(%%r1,%2),0 \n\t" + "vlef %%v19,164(%%r1,%2),0 \n\t" + "vlef %%v18,168(%%r1,%2),0 \n\t" + "vlef %%v19,172(%%r1,%2),0 \n\t" + "vlef %%v18,176(%%r1,%2),2 \n\t" + "vlef %%v19,180(%%r1,%2),2 \n\t" + "vlef %%v18,184(%%r1,%2),3 \n\t" + "vlef %%v19,188(%%r1,%2),3 \n\t" + + "vlef %%v20,192(%%r1,%2),0 \n\t" + "vlef %%v21,196(%%r1,%2),0 \n\t" + "vlef %%v20,200(%%r1,%2),0 \n\t" + "vlef %%v21,204(%%r1,%2),0 \n\t" + "vlef %%v20,208(%%r1,%2),2 \n\t" + "vlef %%v21,212(%%r1,%2),2 \n\t" + "vlef %%v20,216(%%r1,%2),3 \n\t" + "vlef %%v21,220(%%r1,%2),3 \n\t" + + "vlef %%v22,224(%%r1,%2),0 \n\t" + "vlef %%v23,228(%%r1,%2),0 \n\t" + "vlef %%v22,232(%%r1,%2),0 \n\t" + "vlef %%v23,236(%%r1,%2),0 \n\t" + "vlef %%v22,240(%%r1,%2),2 \n\t" + "vlef %%v23,244(%%r1,%2),2 \n\t" + "vlef %%v22,248(%%r1,%2),3 \n\t" + "vlef %%v23,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchsb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchsb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = camin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=CABS1(x,0); + i++; + } + + while (i < n) { + if (ABS(x[i*2]) < minf) { + minf = ABS(x[i*2]); + } + i++; + } + return (minf); + + } else { + + inc_x2 = 2 * inc_x; + minf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) < minf) { + minf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) < minf) { + minf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) < minf) { + 
minf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c new file mode 100644 index 000000000..f4ebc21bd --- /dev/null +++ b/kernel/zarch/casum.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vfasb %%v0,%%v0,%%v3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "ler %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -32; + if ( n1 > 0 ) + { + + sumf = casum_kernel_32(n1, x); + i=n1; + ip=2*n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } + + } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c new file mode 100644 index 000000000..2176f3dcd --- /dev/null +++ b/kernel/zarch/caxpy.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( +#if !defined(CONJ) + "vlrepf %%v0,0(%3) \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v1,4(%3),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%3),1 \n\t" + "vlef %%v1,4(%3),3 \n\t" +#else + "vlef %%v0,0(%3),1 \n\t" + "vlef %%v0,0(%3),3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v0,0(%3),2 \n\t" + "vlrepf %%v1,4(%3) \n\t" +#endif + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 
2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2]; + + if (n <= 0) return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_16(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + + } + return (0); + + + } + + inc_x *= 2; + inc_y *= 2; + + while (i < n) { + +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + ix += inc_x; + iy += inc_y; + i++; + + } + return (0); + +} + + diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c new file mode 100644 index 000000000..fc0b8d648 --- /dev/null +++ b/kernel/zarch/ccopy.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+
+static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+    __asm__ volatile (
+        "lgr %%r1,%1                  \n\t"
+        "lgr %%r2,%2                  \n\t"
+        "srlg  %%r0,%0,5              \n\t"
+        "0:                           \n\t"
+        "pfd 1, 1024(%%r1)            \n\t"
+        "pfd 2, 1024(%%r2)            \n\t"
+        "mvc 0(256,%%r2),0(%%r1)      \n\t"
+        "agfi   %%r1,256              \n\t"
+        "agfi   %%r2,256              \n\t"
+        "brctg %%r0,0b                "
+        :
+        :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
+        :"memory","cc","r0","r1","r2"
+    );
+}
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0,iy=0;
+
+    if ( n <= 0 ) return(0);
+
+    if ( (inc_x == 1) && (inc_y == 1 ))
+    {
+
+        BLASLONG n1 = n & -32;
+        if ( n1 > 0 )
+        {
+            ccopy_kernel_32(n1, x, y);
+            i=n1;
+            ix=n1*2;
+            iy=n1*2;
+        }
+
+        while(i < n)
+        {
+            y[iy]   = x[ix] ;
+            y[iy+1] = x[ix+1] ;
+            ix+=2;
+            iy+=2;
+            i++ ;
+
+        }
+
+
+    }
+    else
+    {
+
+        BLASLONG inc_x2 = 2 * inc_x;
+        BLASLONG inc_y2 = 2 * inc_y;
+
+        while(i < n)
+        {
+            y[iy]   = x[ix] ;
+            y[iy+1] = x[ix+1] ;
+            ix += inc_x2 ;
+            iy += inc_y2 ;
+            i++ ;
+
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c
new file mode 100644
index 000000000..3eda2979b
--- /dev/null
+++ b/kernel/zarch/cdot.c
@@ -0,0 +1,182 @@
+/***************************************************************************
+Copyright (c) 2013-2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
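
Editor's note: the dot kernel below never forms the complex product directly. `verllg`
rotates each 64-bit lane of x by 32 bits to swap real and imaginary parts, so the kernel
carries four running sums (re·re, im·im, re·im, im·re) and applies the CONJ-dependent
signs once at the end, in scalar code. A scalar sketch of the accumulation, mirroring the
file's own tail loop (hypothetical name, unit stride assumed):

    /* Sketch of cdot_kernel_16's four partial sums. */
    static void cdot_sketch(long n, const float *x, const float *y, float *d)
    {
        for (long i = 0; i < 2 * n; i += 2) {
            d[0] += x[i] * y[i];         /* re(x)*re(y) */
            d[1] += x[i + 1] * y[i + 1]; /* im(x)*im(y) */
            d[2] += x[i] * y[i + 1];     /* re(x)*im(y) */
            d[3] += x[i + 1] * y[i];     /* im(x)*re(y) */
        }
    }
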
+*****************************************************************************/ + +#include "common.h" + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v24,%%v24,%%v28 \n\t" + "vfasb %%v24,%%v24,%%v30 \n\t" + "vrepg %%v26,%%v24,1 \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vfasb %%v25,%%v25,%%v29 \n\t" + "vfasb %%v25,%%v25,%%v31 \n\t" + "vrepg %%v27,%%v25,1 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vstef %%v24,0(%3),0 \n\t" + "vstef %%v24,4(%3),1 \n\t" + "vstef %%v25,8(%3),1 \n\t" + "vstef %%v25,12(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) + cdot_kernel_16(n1, x, y, dot); + + i = n1; + BLASLONG j = i * 2; + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + 
CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} + + diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c new file mode 100644 index 000000000..f04a624ac --- /dev/null +++ b/kernel/zarch/crot.c @@ -0,0 +1,256 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
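
Editor's note: crot applies a Givens rotation with real cosine c and sine s. Because both
factors are real, each of the 2*n interleaved float lanes transforms independently, which
is why the kernel can use plain lane-wise multiply-add (`vfmsb`/`vfmasb`/`vfmssb`) with c
and s splatted into %%v0 and %%v1. A scalar sketch of the per-lane update (hypothetical
name, unit stride assumed):

    /* Sketch of crot_kernel_32: x' = c*x + s*y and y' = c*y - s*x,
       applied to all 2*n interleaved float lanes. */
    static void crot_sketch(long n, float *x, float *y, float c, float s)
    {
        for (long k = 0; k < 2 * n; k++) {
            float t = c * x[k] + s * y[k];
            y[k] = c * y[k] - s * x[k];
            x[k] = t;
        }
    }
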
+*****************************************************************************/ + +#include "common.h" + +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* 
yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c new file mode 100644 index 000000000..0c15c5add --- /dev/null +++ b/kernel/zarch/cscal.c @@ -0,0 +1,456 @@ +/*************************************************************************** +Copyright (c) 2013 - 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "vlef %%v1,4(%1),0 \n\t" + "vlef %%v1,4(%1),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%1),1 \n\t" + "vlef %%v1,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + "verllg %%v28,%%v20,32 \n\t" + "verllg %%v29,%%v21,32 \n\t" + "verllg %%v30,%%v22,32 \n\t" + "verllg %%v31,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlef %%v0,4(%1),0 \n\t" + "vlef %%v0,4(%1),2 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,4(%1),1 \n\t" + "vlef %%v0,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) 
\n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v16,%%v16,32 \n\t" + "verllg %%v17,%%v17,32 \n\t" + "verllg %%v18,%%v18,32 \n\t" + "verllg %%v19,%%v19,32 \n\t" + "verllg %%v20,%%v20,32 \n\t" + "verllg %%v21,%%v21,32 \n\t" + "verllg %%v22,%%v22,32 \n\t" + "verllg %%v23,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) + { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * 
x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); + + if (inc_x != 1) { + inc_x <<= 1; + + if (da_r == 0.0) { + + BLASLONG n1 = n & -2; + + if (da_i == 0.0) { + + while (j < n1) { + + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; + + } + + } else { + + while (j < n1) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + + + } + + } else { + + + if (da_i == 0.0) { + BLASLONG n1 = n & -2; + + while (j < n1) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } else { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } + + } + + return (0); + } + + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + alpha[0] = da_r; + alpha[1] = da_i; + + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else + if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + + i = n1 << 1; + j = n1; + } + + + if (da_r == 0.0) { + + if (da_i == 0.0) { + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } else { + + if (da_i == 0.0) { + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } + + return (0); +} diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c new file mode 100644 index 000000000..256995d50 --- /dev/null +++ b/kernel/zarch/cswap.c @@ -0,0 +1,183 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + 
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + cswap_kernel_32(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c new file mode 100644 index 000000000..b74af5d37 --- /dev/null +++ b/kernel/zarch/damax.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
+{
+    FLOAT amax;
+
+    __asm__ volatile (
+        "vl %%v0,0(%2)                \n\t"
+        "vflpdb %%v0,%%v0             \n\t"
+        "srlg %%r0,%1,5               \n\t"
+        "xgr %%r1,%%r1                \n\t"
+        "0:                           \n\t"
+        "pfd 1, 1024(%%r1,%2)         \n\t"
+
+        "vl %%v16,0(%%r1,%2)          \n\t"
+        "vl %%v17,16(%%r1,%2)         \n\t"
+        "vl %%v18,32(%%r1,%2)         \n\t"
+        "vl %%v19,48(%%r1,%2)         \n\t"
+        "vl %%v20,64(%%r1,%2)         \n\t"
+        "vl %%v21,80(%%r1,%2)         \n\t"
+        "vl %%v22,96(%%r1,%2)         \n\t"
+        "vl %%v23,112(%%r1,%2)        \n\t"
+        "vflpdb  %%v16, %%v16         \n\t"
+        "vflpdb  %%v17, %%v17         \n\t"
+        "vflpdb  %%v18, %%v18         \n\t"
+        "vflpdb  %%v19, %%v19         \n\t"
+        "vflpdb  %%v20, %%v20         \n\t"
+        "vflpdb  %%v21, %%v21         \n\t"
+        "vflpdb  %%v22, %%v22         \n\t"
+        "vflpdb  %%v23, %%v23         \n\t"
+
+        "vfchdb %%v24,%%v16,%%v17     \n\t"
+        "vfchdb %%v25,%%v18,%%v19     \n\t"
+        "vfchdb %%v26,%%v20,%%v21     \n\t"
+        "vfchdb %%v27,%%v22,%%v23     \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"
+
+        "vfchdb %%v28,%%v24,%%v25     \n\t"
+        "vfchdb %%v29,%%v26,%%v27     \n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"
+
+        "vfchdb %%v30,%%v28,%%v29     \n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"
+
+        "vfchdb %%v31,%%v30,%%v0      \n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31   \n\t"
+
+        "vl %%v16,128(%%r1,%2)        \n\t"
+        "vl %%v17,144(%%r1,%2)        \n\t"
+        "vl %%v18,160(%%r1,%2)        \n\t"
+        "vl %%v19,176(%%r1,%2)        \n\t"
+        "vl %%v20,192(%%r1,%2)        \n\t"
+        "vl %%v21,208(%%r1,%2)        \n\t"
+        "vl %%v22,224(%%r1,%2)        \n\t"
+        "vl %%v23,240(%%r1,%2)        \n\t"
+        "vflpdb  %%v16, %%v16         \n\t"
+        "vflpdb  %%v17, %%v17         \n\t"
+        "vflpdb  %%v18, %%v18         \n\t"
+        "vflpdb  %%v19, %%v19         \n\t"
+        "vflpdb  %%v20, %%v20         \n\t"
+        "vflpdb  %%v21, %%v21         \n\t"
+        "vflpdb  %%v22, %%v22         \n\t"
+        "vflpdb  %%v23, %%v23         \n\t"
+
+        "vfchdb %%v24,%%v16,%%v17     \n\t"
+        "vfchdb %%v25,%%v18,%%v19     \n\t"
+        "vfchdb %%v26,%%v20,%%v21     \n\t"
+        "vfchdb %%v27,%%v22,%%v23     \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"
+
+        "vfchdb %%v28,%%v24,%%v25     \n\t"
+        "vfchdb %%v29,%%v26,%%v27     \n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"
+
+        "vfchdb %%v30,%%v28,%%v29     \n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"
+
+        "vfchdb %%v31,%%v30,%%v0      \n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31   \n\t"
+
+        "agfi %%r1, 256               \n\t"
+        "brctg %%r0, 0b               \n\t"
+
+        "vrepg %%v16,%%v0,1           \n\t"
+        "wfchdb %%v17,%%v16,%%v0      \n\t"
+        "vsel %%v0,%%v16,%%v0,%%v17   \n\t"
+        "ldr %0,%%f0                  "
+        :"=f"(amax)
+        :"r"(n),"ZR"((const FLOAT (*)[n])x)
+        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
+    );
+
+    return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+    BLASLONG i = 0;
+    BLASLONG j = 0;
+    FLOAT maxf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return (maxf);
+
+    if (inc_x == 1) {
+
+        BLASLONG n1 = n & -32;
+        if (n1 > 0) {
+
+            maxf = damax_kernel_32(n1, x);
+
+            i = n1;
+        }
+        else
+        {
+            maxf=ABS(x[0]);
+            i++;
+        }
+
+        while (i < n) {
+            if (ABS(x[i]) > maxf) {
+                maxf = ABS(x[i]);
+            }
+            i++;
+        }
+        return (maxf);
+
+    } else {
+
+        maxf=ABS(x[0]);
+        i += inc_x;
+        j++;
+
+        BLASLONG n1 = (n - 1) & -4;
+        while (j < n1) {
+
+            if (ABS(x[i]) > maxf) {
+                maxf = ABS(x[i]);
+            }
+            if (ABS(x[i + inc_x]) > maxf) {
+                maxf = ABS(x[i + inc_x]);
+            }
+            if 
(ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c new file mode 100644 index 000000000..4cf5e88b1 --- /dev/null +++ b/kernel/zarch/damin.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
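
Editor's note: damin is the same tournament with the `vfchdb` operand order reversed, so
each `vsel` keeps the smaller lane. Worth calling out here is the blocking arithmetic that
every one of these wrappers uses: `n & -32` rounds n down to a multiple of 32 for the
vector kernel, and `(n - 1) & -4` rounds the strided loop count down to a multiple of 4
after the first element is handled separately. A small worked example (hypothetical
helper, plain long for BLASLONG):

    /* Blocking masks (two's complement: -32 = ~31, -4 = ~3). */
    static void blocking_example(void)
    {
        long n  = 100;
        long n1 = n & -32;       /* 96: handled by the vector kernel            */
        long nt = n - n1;        /*  4: left for the scalar tail                */
        long m1 = (n - 1) & -4;  /* 96: 4-way unrolled part of the strided loop */
        (void)nt; (void)m1;
    }
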
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
+{
+    FLOAT amin;
+
+    __asm__ volatile (
+        "vl %%v0,0(%2)                \n\t"
+        "vflpdb %%v0,%%v0             \n\t"
+        "srlg %%r0,%1,5               \n\t"
+        "xgr %%r1,%%r1                \n\t"
+        "0:                           \n\t"
+        "pfd 1, 1024(%%r1,%2)         \n\t"
+
+        "vl %%v16,0(%%r1,%2)          \n\t"
+        "vl %%v17,16(%%r1,%2)         \n\t"
+        "vl %%v18,32(%%r1,%2)         \n\t"
+        "vl %%v19,48(%%r1,%2)         \n\t"
+        "vl %%v20,64(%%r1,%2)         \n\t"
+        "vl %%v21,80(%%r1,%2)         \n\t"
+        "vl %%v22,96(%%r1,%2)         \n\t"
+        "vl %%v23,112(%%r1,%2)        \n\t"
+        "vflpdb  %%v16, %%v16         \n\t"
+        "vflpdb  %%v17, %%v17         \n\t"
+        "vflpdb  %%v18, %%v18         \n\t"
+        "vflpdb  %%v19, %%v19         \n\t"
+        "vflpdb  %%v20, %%v20         \n\t"
+        "vflpdb  %%v21, %%v21         \n\t"
+        "vflpdb  %%v22, %%v22         \n\t"
+        "vflpdb  %%v23, %%v23         \n\t"
+
+        "vfchdb %%v24,%%v17,%%v16     \n\t"
+        "vfchdb %%v25,%%v19,%%v18     \n\t"
+        "vfchdb %%v26,%%v21,%%v20     \n\t"
+        "vfchdb %%v27,%%v23,%%v22     \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"
+
+        "vfchdb %%v28,%%v25,%%v24     \n\t"
+        "vfchdb %%v29,%%v27,%%v26     \n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"
+
+        "vfchdb %%v30,%%v29,%%v28     \n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"
+
+        "vfchdb %%v31,%%v0,%%v30      \n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31   \n\t"
+
+        "vl %%v16,128(%%r1,%2)        \n\t"
+        "vl %%v17,144(%%r1,%2)        \n\t"
+        "vl %%v18,160(%%r1,%2)        \n\t"
+        "vl %%v19,176(%%r1,%2)        \n\t"
+        "vl %%v20,192(%%r1,%2)        \n\t"
+        "vl %%v21,208(%%r1,%2)        \n\t"
+        "vl %%v22,224(%%r1,%2)        \n\t"
+        "vl %%v23,240(%%r1,%2)        \n\t"
+        "vflpdb  %%v16, %%v16         \n\t"
+        "vflpdb  %%v17, %%v17         \n\t"
+        "vflpdb  %%v18, %%v18         \n\t"
+        "vflpdb  %%v19, %%v19         \n\t"
+        "vflpdb  %%v20, %%v20         \n\t"
+        "vflpdb  %%v21, %%v21         \n\t"
+        "vflpdb  %%v22, %%v22         \n\t"
+        "vflpdb  %%v23, %%v23         \n\t"
+
+        "vfchdb %%v24,%%v17,%%v16     \n\t"
+        "vfchdb %%v25,%%v19,%%v18     \n\t"
+        "vfchdb %%v26,%%v21,%%v20     \n\t"
+        "vfchdb %%v27,%%v23,%%v22     \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"
+
+        "vfchdb %%v28,%%v25,%%v24     \n\t"
+        "vfchdb %%v29,%%v27,%%v26     \n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"
+
+        "vfchdb %%v30,%%v29,%%v28     \n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"
+
+        "vfchdb %%v31,%%v0,%%v30      \n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31   \n\t"
+
+        "agfi %%r1, 256               \n\t"
+        "brctg %%r0, 0b               \n\t"
+
+        "vrepg %%v16,%%v0,1           \n\t"
+        "wfchdb %%v17,%%v0,%%v16      \n\t"
+        "vsel %%v0,%%v16,%%v0,%%v17   \n\t"
+        "ldr %0,%%f0                  "
+        :"=f"(amin)
+        :"r"(n),"ZR"((const FLOAT (*)[n])x)
+        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
+    );
+
+    return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+    BLASLONG i = 0;
+    BLASLONG j = 0;
+    FLOAT minf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return (minf);
+
+    if (inc_x == 1) {
+
+        BLASLONG n1 = n & -32;
+        if (n1 > 0) {
+
+            minf = damin_kernel_32(n1, x);
+
+            i = n1;
+        }
+        else
+        {
+            minf=ABS(x[0]);
+            i++;
+        }
+
+        while (i < n) {
+            if (ABS(x[i]) < minf) {
+                minf = ABS(x[i]);
+            }
+            i++;
+        }
+        return (minf);
+
+    } else {
+
+        minf=ABS(x[0]);
+        i += inc_x;
+        j++;
+
+        BLASLONG n1 = (n - 1) & -4;
+        while (j < n1) {
+
+            if (ABS(x[i]) < minf) {
+                minf = ABS(x[i]);
+            }
+            if (ABS(x[i + inc_x]) < minf) {
+                minf = ABS(x[i + inc_x]);
+            }
+            if 
(ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 7a42a0863..fea431c34 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" #include @@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ABS fabsf #endif - - - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum ; - __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_temp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_temp],256(%[ptr_temp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "clgrjl %[ptr_temp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v2,%%v3 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum],%%f0 \n\t" - : [asum] "=f"(asum),[ptr_temp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return asum; - +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl 
%%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; } - - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 16f82a587..e8823745e 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
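
Editor's note: the daxpy hunk below drops the intrinsics-based Z13_A kernel in favor of
the indexed-addressing inline asm used by the other kernels in this series; the operation
itself is unchanged. For reference, a scalar sketch of what every variant computes
(hypothetical name, plain long for BLASLONG):

    /* Sketch of daxpy: y[i] += alpha * x[i] for i in [0, n). */
    static void daxpy_sketch(long n, double alpha, const double *x, double *y)
    {
        for (long i = 0; i < n; i++)
            y[i] += alpha * x[i];
    }
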
*****************************************************************************/ - #include "common.h" -#define PREFETCH_INS 1 -#if defined(Z13_A) -#include - -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) -{ - BLASLONG i = 0; - __vector double v_a = {alpha,alpha}; - __vector double * v_y=(__vector double *)y; - __vector double * v_x=(__vector double *)x; - - for(; i -#endif - -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double v_x2 = {x2,x2}; - __vector double v_x3 = {x3,x3}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; - } -} - -#else - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i<4; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - } -} - - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC - -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } + __asm__ volatile ( + "vlrepg %%v0,0(%5) \n\t" + "vlrepg %%v1,8(%5) \n\t" + "vlrepg %%v2,16(%5) \n\t" + "vlrepg %%v3,24(%5) \n\t" + "vlrepg %%v4,%7 \n\t" + "vfmdb %%v0,%%v0,%%v4 \n\t" + "vfmdb %%v1,%%v1,%%v4 \n\t" + "vfmdb %%v2,%%v2,%%v4 \n\t" + "vfmdb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 
\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "vl %%v4,32(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" + + "vl %%v4,48(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + + "vl %%v4,64(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" + + "vl %%v4,80(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" + + "vl %%v4,96(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" + + "vl %%v4,112(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0,*a1; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 
= ap[1]; - - for ( i=0; i<2; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; - } -} - - -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0; - x0 = xo[0] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } + __asm__ volatile ( + "vlrepg %%v0,0(%3) \n\t" + "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v2,%5 \n\t" + "vfmdb %%v0,%%v0,%%v2 \n\t" + "vfmdb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmadb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + 
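+ /* note: every buffer is passed as a whole-array memory operand so the compiler knows what the asm reads; y is updated in place, which the "memory" clobber that follows accounts for. */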
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap; - - for ( i=0; i<1; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } + __asm__ volatile ( + "vlrepg %%v0,0(%2) \n\t" + "vlrepg %%v1,%4 \n\t" + "vfmdb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - -#endif - - - static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; - - for ( i=0; i -#endif #define NBMAX 2048 -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - __vector double temp2 = {0,0}; - __vector double temp3 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; - temp3 += 
v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; - y[2] = temp2[0] + temp2[1]; - y[3] = temp3[0] + temp3[1];; -} -#else -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; - temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - y[2] = temp2; - y[3] = temp3; + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + 
"vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v4,%%v0,1 \n\t" + "adbr %%f0,%%f4 \n\t" + "std %%f0,0(%6) \n\t" + "vrepg %%v4,%%v1,1 \n\t" + "adbr %%f1,%%f4 \n\t" + "std %%f1,8(%6) \n\t" + "vrepg %%v4,%%v2,1 \n\t" + "adbr %%f2,%%f4 \n\t" + "std %%f2,16(%6) \n\t" + "vrepg %%v4,%%v3,1 \n\t" + "adbr %%f3,%%f4 \n\t" + "std %%f3,24(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; -} -#else -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" 
+ "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v2,%%v0,1 \n\t" + "adbr %%f0,%%f2 \n\t" + "std %%f0,0(%4) \n\t" + "vrepg %%v2,%%v1,1 \n\t" + "adbr %%f1,%%f2 \n\t" + "std %%f1,8(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)a0; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg 
%%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "std %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; - - - FLOAT temp0 = 0.0; - - for ( i=0; i< n; i+=4 ) + for (i = 0; i < n; i++) { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + dest[i] = *src; + src += inc_src; } - y[0] = temp0; } -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { - BLASLONG i; - for ( i=0; i 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c new file mode 100644 index 000000000..d7c86735f --- /dev/null +++ b/kernel/zarch/dmin.c @@ -0,0 +1,182 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index bf29538c7..c91f95800 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) 
\n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc", "r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 
2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb 
%%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -32; if ( n1 > 0 ) { - - drot_kernel_32(n1, x, y, c, s); + FLOAT cosa,sina; + cosa=c; + sina=s; + drot_kernel_32(n1, x, y, &cosa, &sina); i=n1; } @@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index e29f51012..ccc6dd95d 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
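/* note: the rewritten kernels in this hunk process 16 doubles per iteration
   ("srlg %%r0,%0,4" divides the length by 16), so the callers below change
   the block mask from n & -32 to n & -16. A minimal scalar sketch of the
   intended effect, assuming n1 = n & -16:

       for (i = 0; i < n1; i++) x[i] *= da;    (dscal_kernel_16)
       for (i = 0; i < n1; i++) x[i]  = 0.0;   (dscal_kernel_16_zero)
*/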
#include "common.h" -#ifdef Z13_A -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) -{ - - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "srlg %[n],%[n],4 \n\t" - "vlr %%v1,%%v0 \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "la %[x_ptr], 128(%[x_ptr]) \n\t" - "aghik %[n], %[n], -1 \n\t" - "jle 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v0 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" - "lay %[x_ptr], -128(%[x_ptr]) \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "brctg %[n],1b \n\t" - "2: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v1 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "lay %[x_ptr] , -128(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) - : [alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - } -#else -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__ volatile ( + "vlrepg %%v0,%1 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} - /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "vlr %%v1,%%v0 \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v1 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v1 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v1 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v1 \n\t" - "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vfmdb %%v25,%%v25,%%v1 \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vfmdb %%v27,%%v27,%%v1 \n\t" - "vfmdb %%v28,%%v28,%%v0 \n\t" - "vfmdb %%v29,%%v29,%%v1 \n\t" - "vfmdb %%v30,%%v30,%%v0 \n\t" 
- "vfmdb %%v31,%%v31,%%v1 \n\t" - "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" - "la %[x_ptr], 256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - } -#endif -static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "sllg %%r0,%[n],3 \n\t" - "vzero %%v25 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vst %%v24, 32(%[x_ptr]) \n\t" - "vst %%v25, 48(%[x_ptr]) \n\t" - "vst %%v24, 64(%[x_ptr]) \n\t" - "vst %%v25, 80(%[x_ptr]) \n\t" - "vst %%v24, 96(%[x_ptr]) \n\t" - "vst %%v25, 112(%[x_ptr]) \n\t" - "vst %%v24, 128(%[x_ptr]) \n\t" - "vst %%v25, 144(%[x_ptr]) \n\t" - "vst %%v24, 160(%[x_ptr]) \n\t" - "vst %%v25, 176(%[x_ptr]) \n\t" - "vst %%v24, 192(%[x_ptr]) \n\t" - "vst %%v25, 208(%[x_ptr]) \n\t" - "vst %%v24, 224(%[x_ptr]) \n\t" - "vst %%v25, 240(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" , "r0", "v24" ,"v25" - ); + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32_zero(n1 , x); + dscal_kernel_16_zero(n1, x); j=n1; } @@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32(n1 , da , x); + dscal_kernel_16(n1, da, x); j=n1; } while(j < n) @@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } return 0; -} \ No newline at end of file +} + + diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c new file mode 100644 index 000000000..17461a029 --- /dev/null +++ b/kernel/zarch/dsdot.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + double dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmsb %%v16,%%v16,%%v24 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmsb %%v17,%%v17,%%v25 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmsb %%v18,%%v18,%%v26 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmsb %%v19,%%v19,%%v27 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmsb %%v20,%%v20,%%v28 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmsb %%v21,%%v21,%%v29 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmsb %%v22,%%v22,%%v30 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmsb %%v23,%%v23,%%v31 \n\t" + + "vflls %%v24,%%v16 \n\t" + "vflls %%v25,%%v17 \n\t" + "vflls %%v26,%%v18 \n\t" + "vflls %%v27,%%v19 \n\t" + "vflls %%v28,%%v20 \n\t" + "vflls %%v29,%%v21 \n\t" + "vflls %%v30,%%v22 \n\t" + "vflls %%v31,%%v23 \n\t" + + "veslg %%v16,%%v16,32 \n\t" + "veslg %%v17,%%v17,32 \n\t" + "veslg %%v18,%%v18,32 \n\t" + "veslg %%v19,%%v19,32 \n\t" + "veslg %%v20,%%v20,32 \n\t" + "veslg %%v21,%%v21,32 \n\t" + "veslg %%v22,%%v22,32 \n\t" + "veslg %%v23,%%v23,32 \n\t" + + "vflls %%v16,%%v16 \n\t" + "vflls %%v17,%%v17 \n\t" + "vflls %%v18,%%v18 \n\t" + "vflls %%v19,%%v19 \n\t" + "vflls %%v20,%%v20 \n\t" + "vflls %%v21,%%v21 \n\t" + "vflls %%v22,%%v22 \n\t" + "vflls %%v23,%%v23 \n\t" + + "vfadb %%v16,%%v16,%%v24 \n\t" + "vfadb %%v17,%%v17,%%v25 \n\t" + "vfadb %%v18,%%v18,%%v26 \n\t" + "vfadb %%v19,%%v19,%%v27 \n\t" + "vfadb %%v20,%%v20,%%v28 \n\t" + "vfadb %%v21,%%v21,%%v29 \n\t" + "vfadb %%v22,%%v22,%%v30 \n\t" + "vfadb %%v23,%%v23,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v20 \n\t" + "vfadb %%v17,%%v17,%%v21 \n\t" + "vfadb %%v18,%%v18,%%v22 \n\t" + "vfadb %%v19,%%v19,%%v23 \n\t" + "vfadb %%v16,%%v16,%%v18 \n\t" + "vfadb %%v17,%%v17,%%v19 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v0,%%v16,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr 
%0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + double dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = dsdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index d7e079147..8070ef41a 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - - #include "common.h" - - -#if defined(Z13_SWAP_A) -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 
160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#else - -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - 
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; @@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, } - - diff --git 
a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c new file mode 100644 index 000000000..e7f096e0d --- /dev/null +++ b/kernel/zarch/icamax.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v0,8(%3),1 \n\t" + "vlef %%v1,12(%3),1 \n\t" + "vlef %%v0,16(%3),2 \n\t" + "vlef %%v1,20(%3),2 \n\t" + "vlef %%v0,24(%3),3 \n\t" + "vlef %%v1,28(%3),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v1,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,16 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%3) \n\t" + + "vlef %%v16,0(%%r1,%3),0 \n\t" + "vlef %%v17,4(%%r1,%3),0 \n\t" + "vlef %%v16,8(%%r1,%3),1 \n\t" + "vlef %%v17,12(%%r1,%3),1 \n\t" + "vlef %%v16,16(%%r1,%3),2 \n\t" + "vlef %%v17,20(%%r1,%3),2 \n\t" + "vlef %%v16,24(%%r1,%3),3 \n\t" + "vlef %%v17,28(%%r1,%3),3 \n\t" + + "vlef %%v18,32(%%r1,%3),0 \n\t" + "vlef %%v19,36(%%r1,%3),0 \n\t" + "vlef %%v18,40(%%r1,%3),1 \n\t" + "vlef %%v19,44(%%r1,%3),1 \n\t" + "vlef %%v18,48(%%r1,%3),2 \n\t" + "vlef %%v19,52(%%r1,%3),2 \n\t" + "vlef %%v18,56(%%r1,%3),3 \n\t" + "vlef 
%%v19,30(%%r1,%3),3 \n\t" + + "vlef %%v20,64(%%r1,%3),0 \n\t" + "vlef %%v21,68(%%r1,%3),0 \n\t" + "vlef %%v20,72(%%r1,%3),1 \n\t" + "vlef %%v21,76(%%r1,%3),1 \n\t" + "vlef %%v20,80(%%r1,%3),2 \n\t" + "vlef %%v21,84(%%r1,%3),2 \n\t" + "vlef %%v20,88(%%r1,%3),3 \n\t" + "vlef %%v21,92(%%r1,%3),3 \n\t" + + "vlef %%v22,96(%%r1,%3),0 \n\t" + "vlef %%v23,100(%%r1,%3),0 \n\t" + "vlef %%v22,104(%%r1,%3),1 \n\t" + "vlef %%v23,108(%%r1,%3),1 \n\t" + "vlef %%v22,112(%%r1,%3),2 \n\t" + "vlef %%v23,116(%%r1,%3),2 \n\t" + "vlef %%v22,120(%%r1,%3),3 \n\t" + "vlef %%v23,124(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + 
"vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = icamax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } +} + + diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c new file mode 100644 index 000000000..b9c1ccd9c --- /dev/null +++ b/kernel/zarch/icamin.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v0,8(%3),1 \n\t" + "vlef %%v1,12(%3),1 \n\t" + "vlef %%v0,16(%3),2 \n\t" + "vlef %%v1,20(%3),2 \n\t" + "vlef %%v0,24(%3),3 \n\t" + "vlef %%v1,28(%3),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v1,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,16 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vlef %%v16,0(%%r1,%3),0 \n\t" + "vlef %%v17,4(%%r1,%3),0 \n\t" + "vlef %%v16,8(%%r1,%3),1 \n\t" + "vlef %%v17,12(%%r1,%3),1 \n\t" + "vlef %%v16,16(%%r1,%3),2 \n\t" + "vlef %%v17,20(%%r1,%3),2 \n\t" + "vlef %%v16,24(%%r1,%3),3 \n\t" + "vlef %%v17,28(%%r1,%3),3 \n\t" + + "vlef %%v18,32(%%r1,%3),0 \n\t" + "vlef %%v19,36(%%r1,%3),0 \n\t" + "vlef %%v18,40(%%r1,%3),1 \n\t" + "vlef %%v19,44(%%r1,%3),1 \n\t" + "vlef %%v18,48(%%r1,%3),2 \n\t" + "vlef %%v19,52(%%r1,%3),2 \n\t" + "vlef %%v18,56(%%r1,%3),3 \n\t" + "vlef %%v19,30(%%r1,%3),3 \n\t" + + "vlef %%v20,64(%%r1,%3),0 \n\t" + "vlef %%v21,68(%%r1,%3),0 \n\t" + "vlef %%v20,72(%%r1,%3),1 \n\t" + "vlef %%v21,76(%%r1,%3),1 \n\t" + "vlef %%v20,80(%%r1,%3),2 \n\t" + "vlef %%v21,84(%%r1,%3),2 \n\t" + "vlef %%v20,88(%%r1,%3),3 \n\t" + "vlef %%v21,92(%%r1,%3),3 \n\t" + + "vlef %%v22,96(%%r1,%3),0 \n\t" + "vlef %%v23,100(%%r1,%3),0 \n\t" + "vlef %%v22,104(%%r1,%3),1 \n\t" + "vlef %%v23,108(%%r1,%3),1 \n\t" + "vlef %%v22,112(%%r1,%3),2 \n\t" + "vlef %%v23,116(%%r1,%3),2 \n\t" + "vlef %%v22,120(%%r1,%3),3 \n\t" + "vlef %%v23,124(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel 
%%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; +} + 
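
For reference, the scalar semantics that icamin_kernel_32 vectorizes are worth spelling out. Each complex element contributes CABS1 = |re| + |im|, and the kernel yields the 0-based index of the smallest such sum over the first n elements; when the two half-width candidates compare equal in the final reduction, the vmnlg picks the smaller index. One oddity to note: the imaginary-part loads into %%v19 follow the offset pattern 36/44/52/60, but both this kernel and icamax above read the last lane of their first block from offset 30, which looks like a typo for 60. A minimal scalar sketch of the intended reduction (the helper name and plain C types are illustrative, not part of the patch):

    #include <math.h>

    /* Illustrative scalar reference for icamin_kernel_32: 0-based index
     * of the first element minimizing |re| + |im| over n complex floats
     * stored as interleaved (re, im) pairs. */
    static long scalar_icamin(long n, const float *x)
    {
        long  imin = 0;
        float minf = fabsf(x[0]) + fabsf(x[1]);

        for (long i = 1; i < n; i++) {
            float v = fabsf(x[2 * i]) + fabsf(x[2 * i + 1]);
            if (v < minf) {      /* strict <, so the first minimum wins */
                minf = v;
                imin = i;
            }
        }
        return imin;
    }

The CNAME wrapper below hands the first n & -32 elements to the vector kernel, finishes the remainder with exactly this kind of scalar loop, and adds 1 for the one-based BLAS return convention.
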
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = icamin_kernel_32(n1, x, &minf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } +} + + diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b67091148..aba880949 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vzero %%v5 \n\t" - "vzero %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 
\n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v26,%%v18 \n\t" - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[maxf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return index; +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; -} + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel 
%%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; +} BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG max = 0; @@ -191,7 +200,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; if (n1 > 0) { - max = diamax_kernel_32_TUNED(n1, x, &maxf); + max = idamax_kernel_32(n1, x, &maxf); i = n1; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 8a7ff1659..3213efa4d 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -23,192 +23,185 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vlrepg %%v18,0(%[ptr_x]) \n\t" - "vzero %%v5 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfchdb %%v16,%%v24,%%v25 \n\t " - "vfchdb %%v17,%%v26 ,%%v27 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28, %%v29 \n\t " - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfchdb %%v16,%%v24,%%v25 \n\t" - "vfchdb %%v17,%%v26 ,%%v27 \n\t" - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28 ,%%v29 \n\t" - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel 
%%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v18 ,%%v26 \n\t " - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[minf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; - +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + 
"vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamin; } - - - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; - BLASLONG min = 0; FLOAT minf = 0.0; - + BLASLONG min = 0; + if (n <= 0 || inc_x <= 0) return (min); - minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant + if (inc_x == 1) { BLASLONG n1 = n & -32; if (n1 > 0) { - min = diamin_kernel_32(n1, x, &minf); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c new file mode 100644 index 000000000..26fff4eb0 --- /dev/null +++ b/kernel/zarch/idmax.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 
\n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = idmax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c new file mode 100644 index 000000000..570b33a15 --- /dev/null +++ b/kernel/zarch/idmin.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop 
" + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = idmin_kernel_32(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c new file mode 100644 index 000000000..95a665b10 --- /dev/null +++ b/kernel/zarch/isamax.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb 
%%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = isamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c new file mode 100644 index 000000000..640fc02c9 --- /dev/null +++ b/kernel/zarch/isamin.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel 
%%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = isamin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + 
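+ /* The vector kernel covers the first n & -64 elements and yields a
+    0-based index; the scalar loop above handles the tail. minf is only
+    seeded when n1 > 0, so this path effectively assumes n >= 64: for
+    shorter vectors minf stays 0.0 and the tail loop never updates it.
+    BLAS i*amin results are 1-based (Fortran convention), hence the +1
+    below. A minimal scalar sketch of the intended semantics, assuming
+    n > 0:
+      min = 0; minf = ABS(x[0]);
+      for (i = 1; i < n; i++)
+        if (ABS(x[i]) < minf) { min = i; minf = ABS(x[i]); }
+      return min + 1;
+  */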
return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c new file mode 100644 index 000000000..0eb350315 --- /dev/null +++ b/kernel/zarch/ismax.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb 
%%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = ismax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c new file mode 100644 index 000000000..f050db8cb --- /dev/null +++ b/kernel/zarch/ismin.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) 
\n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = ismin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 216c3414a..bf5f621a7 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find maximum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - - - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v6 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, 
%%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfchdb %%v25,%%v1,%%v0 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v3,%%v2 \n\t " - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v26,%%v24 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v30,%%v28 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24, %%v1,%%v31 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30, %%v27,%%v3 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0, %%v31,%%v28 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30, %%v27,%%v6 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v6 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[maxf] \n\t" - "3: \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - return index; - + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vleg 
%%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -223,9 +198,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG n1 = n & -16; if (n1 > 0) { - max = ziamax_kernel_16_TUNED(n1, x, &maxf); + max = izamax_kernel_16(n1, x, &maxf); + i = n1; - ix = n1 << 1; } while(i < n) @@ -260,7 +235,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return (max + 1); } - } diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9b2a653a7..3636e8fdf 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -24,253 +24,217 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index ; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - "ld %%f6,0(%[ptr_x]) \n\t" - "lpdbr %%f6,%%f6 \n\t" - "ld %%f7,8(%[ptr_x]) \n\t" - "lpdbr %%f7,%%f7 \n\t" - "adbr %%f6,%%f7 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vrepg %%v6,%%v6,0 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 
,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 ,136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - - "vfchdb %%v25,%%v0 ,%%v1 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v2,%%v3 \n\t" - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v24,%%v26 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v28,%%v30 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24,%%v31, %%v1 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30,%%v3, %%v27 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0,%%v28, %%v31 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30,%%v6 , %%v27 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3f \n\t" - "2: \n\t" - "wfchdb %%v16,%%v6 ,%%v26 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[minf] \n\t" - "3: \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb 
%%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vleg %%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(min); - - + if (inc_x == 1) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + BLASLONG n1 = n & -16; + if (n1 > 0) { + + min = izamin_kernel_16(n1, x, &minf); - min = ziamin_kernel_16_TUNED(n1, x, &minf); i = n1; - ix = n1 << 1; - } - else { - //assign minf - minf = CABS1(x,0); - ix += 2; - i++; - } + } - while(i < n) + while(i < n) + { + if( CABS1(x,ix) < minf ) { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + min = i; + minf = CABS1(x,ix); } + ix += 2; + i++; + } return (min + 1); } else { - inc_x2 = 2 * inc_x; + inc_x2 = 2 * inc_x; - minf = CABS1(x,0); - ix += inc_x2; - i++; + minf = CABS1(x,0); + ix += inc_x2; + i++; - while(i < n) + while(i < n) + { + if( CABS1(x,ix) < minf ) { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; + min = i; + minf = CABS1(x,ix); } + ix += inc_x2; + i++; + } return (min + 1); } - } diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c new file mode 100644 index 000000000..1025cfcbf --- /dev/null +++ b/kernel/zarch/samax.c @@ -0,0 +1,210 @@ 
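+/* samax.c implements SAMAX (largest absolute value of a single-precision
+   vector) with z/Architecture vector (VX) instructions: vflpsb takes
+   lane-wise absolute values, vfchsb compares lanes, and vsel keeps the
+   larger lane. The kernel consumes 64 floats per iteration; the C wrapper
+   below handles tails and strided access. A minimal scalar sketch of what
+   the kernel computes, assuming n > 0:
+     FLOAT amax = ABS(x[0]);
+     for (BLASLONG k = 1; k < n; k++)
+       if (ABS(x[k]) > amax) amax = ABS(x[k]);
+ */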
+/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + 
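+ /* Pairwise tournament reduction, repeated here for the second 32-float
+    block: vfchsb sets a lane mask where its first operand compares
+    higher, and vsel uses that mask to keep the larger lane, folding the
+    eight vectors v16..v23 down to one (8 -> 4 -> 2 -> 1) before merging
+    into the running maximum kept in v0. */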
"vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + maxf = samax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c new file mode 100644 index 000000000..3b8f03e6a --- /dev/null +++ b/kernel/zarch/samin.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG 
n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = samin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c new file mode 100644 index 000000000..2c59ab2e5 --- /dev/null +++ b/kernel/zarch/sasum.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vfasb %%v0,%%v0,%%v3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "ler %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return sumf; + + if (inc_x == 1) { + + n1 = n & -64; + + if (n1 > 0) { + + sumf = sasum_kernel_64(n1, x); + i = n1; + } + + while (i < n) { + sumf += ABS(x[i]); + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += ABS(x[i]); + i += inc_x; + j++; + } + + + } + return sumf; +} + + diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c new file mode 100644 index 000000000..26ead310c --- /dev/null +++ b/kernel/zarch/saxpy.c @@ -0,0 +1,184 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( + "vlrepf %%v0,%3 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,80(%%r1,%1) \n\t" + "vl %%v26,96(%%r1,%1) \n\t" + "vl %%v27,112(%%r1,%1) \n\t" + "vl %%v28,64(%%r1,%2) \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vl %%v30,96(%%r1,%2) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "vl %%v16,128(%%r1,%1) \n\t" + "vl %%v17,144(%%r1,%1) \n\t" + "vl %%v18,160(%%r1,%1) \n\t" + "vl %%v19,176(%%r1,%1) \n\t" + "vl %%v20,128(%%r1,%2) \n\t" + "vl %%v21,144(%%r1,%2) \n\t" + "vl %%v22,160(%%r1,%2) \n\t" + "vl %%v23,176(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,192(%%r1,%1) \n\t" + "vl %%v25,208(%%r1,%1) \n\t" + "vl %%v26,224(%%r1,%1) \n\t" + "vl %%v27,240(%%r1,%1) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,128(%%r1,%2) \n\t" + "vst 
%%v17,144(%%r1,%2) \n\t" + "vst %%v18,160(%%r1,%2) \n\t" + "vst %%v19,176(%%r1,%2) \n\t" + "vst %%v20,192(%%r1,%2) \n\t" + "vst %%v21,208(%%r1,%2) \n\t" + "vst %%v22,224(%%r1,%2) \n\t" + "vst %%v23,240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return 0 ; + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + + if ( n1 ) + saxpy_kernel_64(n1, x, y , &da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return 0 ; + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return 0 ; + +} + + diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c new file mode 100644 index 000000000..ff4227595 --- /dev/null +++ b/kernel/zarch/scopy.c @@ -0,0 +1,85 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
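+*****************************************************************************/
+
+/* scopy_kernel_64 below moves 64 floats (256 bytes) per iteration with a
+   single MVC storage-to-storage copy plus prefetching; for the contiguous
+   bulk it is in effect memcpy(y, x, (n & -64) * sizeof(FLOAT)), while the
+   plain C loops in CNAME cover the remainder and the strided case. */
+
+/***************************************************************************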
+*****************************************************************************/ + +#include "common.h" + +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,6 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","r2" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + if (n <= 0) return 0; + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + + while (i < n) { + y[i] = x[i]; + i++; + + } + + + } else { + + while (i < n) { + + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return 0; + + +} diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c new file mode 100644 index 000000000..fd8c8e445 --- /dev/null +++ b/kernel/zarch/sdot.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
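+*****************************************************************************/
+
+/* sdot_kernel_32 below keeps four partial products in the lanes of %%v0,
+   issuing one FMA per 16-byte block (32 floats per iteration) and folding
+   the lanes with VREPF/AEBR at the end. A scalar model of the blocked part,
+   illustrative only:
+
+       FLOAT dot = 0.0;
+       BLASLONG k;
+       for (k = 0; k < (n & -32); k++)
+           dot += x[k] * y[k];
+*/
+
+/***************************************************************************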
+*****************************************************************************/ + +#include "common.h" + +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + FLOAT dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "vrepf %%v3,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "aebr %%f0,%%f2 \n\t" + "aebr %%f0,%%f3 \n\t" + "ler %0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = sdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c new file mode 100644 index 000000000..92019d732 --- /dev/null +++ b/kernel/zarch/sgemv_n_4.c @@ -0,0 +1,668 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%5) \n\t" + "vlrepf %%v1,4(%5) \n\t" + "vlrepf %%v2,8(%5) \n\t" + "vlrepf %%v3,12(%5) \n\t" + "vlrepf %%v4,%7 \n\t" + "vfmsb %%v0,%%v0,%%v4 \n\t" + "vfmsb %%v1,%%v1,%%v4 \n\t" + "vfmsb %%v2,%%v2,%%v4 \n\t" + "vfmsb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "vl %%v4,32(%%r1,%6) \n\t" + "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" + + "vl %%v4,48(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + + "vl %%v4,64(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" + + "vl %%v4,80(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" + + "vl %%v4,96(%%r1,%6) \n\t" 
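+	/* Same update pattern for every 16-byte tile of y: v4 carries the
+	   tile, v0..v3 hold the alpha-scaled x broadcasts set up in the
+	   prologue, and one FMA per matrix column (here from v24..v27)
+	   accumulates into it before the tile is stored back. */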
+ "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" + + "vl %%v4,112(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%3) \n\t" + "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v2,%5 \n\t" + "vfmsb %%v0,%%v0,%%v2 \n\t" + "vfmsb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: 
\n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%2) \n\t" + "vlrepf %%v1,%4 \n\t" + "vfmsb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i]; + dest += inc_dest; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8],*ybuffer; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + ybuffer = buffer; + + n1 = n >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; 
+ ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + 
return(0); +} + + diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c new file mode 100644 index 000000000..efc06297f --- /dev/null +++ b/kernel/zarch/sgemv_t_4.c @@ -0,0 +1,826 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
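+*****************************************************************************/
+
+/* The kernels below compute the transposed update y += alpha * A**T * x as
+   blocked column dot products, with copy_x/ytemp (further down) keeping the
+   working data contiguous. A plain reference loop for the same result,
+   illustrative only:
+
+       BLASLONG i, j;
+       for (j = 0; j < n; j++) {
+           FLOAT t = 0.0;
+           for (i = 0; i < m; i++)
+               t += a[i + j * lda] * x[i * inc_x];
+           y[j * inc_y] += alpha * t;
+       }
+*/
+
+/***************************************************************************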
+*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "agfi %%r1,16 \n\t" + "brctg 
%%r0,2b                        \n\t"
+
+	"3:                            \n\t"
+	"vrepf %%v4,%%v0,1             \n\t"
+	"aebr %%f0,%%f4                \n\t"
+	"vrepf %%v4,%%v0,2             \n\t"
+	"aebr %%f0,%%f4                \n\t"
+	"vrepf %%v4,%%v0,3             \n\t"
+	"aebr %%f0,%%f4                \n\t"
+	"ste %%f0,0(%6)                \n\t"
+	"vrepf %%v4,%%v1,1             \n\t"
+	"aebr %%f1,%%f4                \n\t"
+	"vrepf %%v4,%%v1,2             \n\t"
+	"aebr %%f1,%%f4                \n\t"
+	"vrepf %%v4,%%v1,3             \n\t"
+	"aebr %%f1,%%f4                \n\t"
+	"ste %%f1,4(%6)                \n\t"
+	"vrepf %%v4,%%v2,1             \n\t"
+	"aebr %%f2,%%f4                \n\t"
+	"vrepf %%v4,%%v2,2             \n\t"
+	"aebr %%f2,%%f4                \n\t"
+	"vrepf %%v4,%%v2,3             \n\t"
+	"aebr %%f2,%%f4                \n\t"
+	"ste %%f2,8(%6)                \n\t"
+	"vrepf %%v4,%%v3,1             \n\t"
+	"aebr %%f3,%%f4                \n\t"
+	"vrepf %%v4,%%v3,2             \n\t"
+	"aebr %%f3,%%f4                \n\t"
+	"vrepf %%v4,%%v3,3             \n\t"
+	"aebr %%f3,%%f4                \n\t"
+	"ste %%f3,12(%6)               "
+	:
+	:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
+	:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
+	);
+}
+
+static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+	__asm__ volatile (
+	"vzero %%v0                    \n\t"
+	"vzero %%v1                    \n\t"
+	"xgr %%r1,%%r1                 \n\t"
+
+	"lghi %%r0,-32                 \n\t"
+	"ngr %%r0,%0                   \n\t"
+	"ltgr %%r0,%%r0                \n\t"
+	"jz 1f                         \n\t"
+
+	"srlg %%r0,%%r0,5              \n\t"
+	"0:                            \n\t"
+	"pfd 1,1024(%%r1,%1)           \n\t"
+	"pfd 1,1024(%%r1,%2)           \n\t"
+	"pfd 1,1024(%%r1,%3)           \n\t"
+
+	"vl %%v16,0(%%r1,%3)           \n\t"
+	"vl %%v17,16(%%r1,%3)          \n\t"
+	"vl %%v18,32(%%r1,%3)          \n\t"
+	"vl %%v19,48(%%r1,%3)          \n\t"
+	"vl %%v20,64(%%r1,%3)          \n\t"
+	"vl %%v21,80(%%r1,%3)          \n\t"
+	"vl %%v22,96(%%r1,%3)          \n\t"
+	"vl %%v23,112(%%r1,%3)         \n\t"
+
+	"vl %%v24,0(%%r1,%1)           \n\t"
+	"vfmasb %%v0,%%v16,%%v24,%%v0  \n\t"
+	"vl %%v25,0(%%r1,%2)           \n\t"
+	"vfmasb %%v1,%%v16,%%v25,%%v1  \n\t"
+
+	"vl %%v26,16(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v17,%%v26,%%v0  \n\t"
+	"vl %%v27,16(%%r1,%2)          \n\t"
+	"vfmasb %%v1,%%v17,%%v27,%%v1  \n\t"
+
+	"vl %%v28,32(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v18,%%v28,%%v0  \n\t"
+	"vl %%v29,32(%%r1,%2)          \n\t"
+	"vfmasb %%v1,%%v18,%%v29,%%v1  \n\t"
+
+	"vl %%v30,48(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v19,%%v30,%%v0  \n\t"
+	"vl %%v31,48(%%r1,%2)          \n\t"
+	"vfmasb %%v1,%%v19,%%v31,%%v1  \n\t"
+
+	"vl %%v24,64(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v20,%%v24,%%v0  \n\t"
+	"vl %%v25,64(%%r1,%2)          \n\t"
+	"vfmasb %%v1,%%v20,%%v25,%%v1  \n\t"
+
+	"vl %%v26,80(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v21,%%v26,%%v0  \n\t"
+	"vl %%v27,80(%%r1,%2)          \n\t"
+	"vfmasb %%v1,%%v21,%%v27,%%v1  \n\t"
+
+	"vl %%v28,96(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v22,%%v28,%%v0  \n\t"
+	"vl %%v29,96(%%r1,%2)          \n\t"
+	"vfmasb %%v1,%%v22,%%v29,%%v1  \n\t"
+
+	"vl %%v30,112(%%r1,%1)         \n\t"
+	"vfmasb %%v0,%%v23,%%v30,%%v0  \n\t"
+	"vl %%v31,112(%%r1,%2)         \n\t"
+	"vfmasb %%v1,%%v23,%%v31,%%v1  \n\t"
+
+	"agfi %%r1,128                 \n\t"
+	"brctg %%r0,0b                 \n\t"
+
+	"1:                            \n\t"
+	"lghi %%r0,28                  \n\t"
+	"ngr %%r0,%0                   \n\t"
+	"ltgr %%r0,%%r0                \n\t"
+	"jz 3f                         \n\t"
+
+	"srlg %%r0,%%r0,2              \n\t"
+	"2:                            \n\t"
+	"vl %%v16,0(%%r1,%3)           \n\t"
+
+	"vl %%v24,0(%%r1,%1)           \n\t"
+	"vfmasb %%v0,%%v16,%%v24,%%v0  \n\t"
+	"vl %%v25,0(%%r1,%2)           \n\t"
+	"vfmasb %%v1,%%v16,%%v25,%%v1  \n\t"
+
+	"agfi %%r1,16                  \n\t"
+	"brctg %%r0,2b                 \n\t"
+
+	"3:                            \n\t"
+	"vrepf %%v2,%%v0,1             \n\t"
+	"aebr %%f0,%%f2                \n\t"
+	"vrepf %%v2,%%v0,2             \n\t"
+	"aebr %%f0,%%f2                \n\t"
+	"vrepf %%v2,%%v0,3             \n\t"
+	"aebr %%f0,%%f2                \n\t"
+	"ste %%f0,0(%4)                \n\t"
+	"vrepf %%v2,%%v1,1             \n\t"
+	"aebr %%f1,%%f2                \n\t"
+	"vrepf %%v2,%%v1,2             \n\t"
+	"aebr %%f1,%%f2                \n\t"
+	"vrepf %%v2,%%v1,3             \n\t"
+	"aebr %%f1,%%f2                \n\t"
+	"ste %%f1,4(%4)                "
+	:
+	:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
+	:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
+	);
+}
+
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
+{
+	__asm__ volatile (
+	"vzero %%v0                    \n\t"
+	"xgr %%r1,%%r1                 \n\t"
+
+	"lghi %%r0,-32                 \n\t"
+	"ngr %%r0,%0                   \n\t"
+	"ltgr %%r0,%%r0                \n\t"
+	"jz 1f                         \n\t"
+
+	"srlg %%r0,%%r0,5              \n\t"
+	"0:                            \n\t"
+	"pfd 1,1024(%%r1,%1)           \n\t"
+	"pfd 1,1024(%%r1,%2)           \n\t"
+
+	"vl %%v16,0(%%r1,%2)           \n\t"
+	"vl %%v17,16(%%r1,%2)          \n\t"
+	"vl %%v18,32(%%r1,%2)          \n\t"
+	"vl %%v19,48(%%r1,%2)          \n\t"
+	"vl %%v20,64(%%r1,%2)          \n\t"
+	"vl %%v21,80(%%r1,%2)          \n\t"
+	"vl %%v22,96(%%r1,%2)          \n\t"
+	"vl %%v23,112(%%r1,%2)         \n\t"
+
+	"vl %%v24,0(%%r1,%1)           \n\t"
+	"vfmasb %%v0,%%v16,%%v24,%%v0  \n\t"
+
+	"vl %%v25,16(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v17,%%v25,%%v0  \n\t"
+
+	"vl %%v26,32(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v18,%%v26,%%v0  \n\t"
+
+	"vl %%v27,48(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v19,%%v27,%%v0  \n\t"
+
+	"vl %%v28,64(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v20,%%v28,%%v0  \n\t"
+
+	"vl %%v29,80(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v21,%%v29,%%v0  \n\t"
+
+	"vl %%v30,96(%%r1,%1)          \n\t"
+	"vfmasb %%v0,%%v22,%%v30,%%v0  \n\t"
+
+	"vl %%v31,112(%%r1,%1)         \n\t"
+	"vfmasb %%v0,%%v23,%%v31,%%v0  \n\t"
+
+	/* advance to the next 128-byte block and loop */
+	"agfi %%r1,128                 \n\t"
+	"brctg %%r0,0b                 \n\t"
+
+	"1:                            \n\t"
+	"lghi %%r0,28                  \n\t"
+	"ngr %%r0,%0                   \n\t"
+	"ltgr %%r0,%%r0                \n\t"
+	"jz 3f                         \n\t"
+
+	"srlg %%r0,%%r0,2              \n\t"
+	"2:                            \n\t"
+	"vl %%v16,0(%%r1,%2)           \n\t"
+
+	"vl %%v24,0(%%r1,%1)           \n\t"
+	"vfmasb %%v0,%%v16,%%v24,%%v0  \n\t"
+
+	"agfi %%r1,16                  \n\t"
+	"brctg %%r0,2b                 \n\t"
+
+	"3:                            \n\t"
+	"vrepf %%v1,%%v0,1             \n\t"
+	"aebr %%f0,%%f1                \n\t"
+	"vrepf %%v1,%%v0,2             \n\t"
+	"aebr %%f0,%%f1                \n\t"
+	"vrepf %%v1,%%v0,3             \n\t"
+	"aebr %%f0,%%f1                \n\t"
+	"ste %%f0,0(%3)                "
+	:
+	:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
+	:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
+	);
+}
+
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
+{
+	BLASLONG i;
+	for (i = 0; i < n; i++)
+	{
+		dest[i] = *src;
+		src += inc_src;
+	}
+}
+
+static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
+{
+	__asm__ volatile (
+	"vlrepf %%v0,%1                \n\t"
+	"xgr %%r1,%%r1                 \n\t"
+
+	"lghi %%r0,-32                 \n\t"
+	"ngr %%r0,%0                   \n\t"
+	"ltgr %%r0,%%r0                \n\t"
+	"jz 1f                         \n\t"
+
+	"srlg %%r0,%%r0,5              \n\t"
+	"0:                            \n\t"
+	"pfd 1,1024(%%r1,%2)           \n\t"
+	"pfd 2,1024(%%r1,%3)           \n\t"
+
+	"vl %%v16,0(%%r1,%2)           \n\t"
+	"vl %%v17,16(%%r1,%2)          \n\t"
+	"vl %%v18,32(%%r1,%2)          \n\t"
+	"vl %%v19,48(%%r1,%2)          \n\t"
+	"vl %%v20,64(%%r1,%2)          \n\t"
+	"vl %%v21,80(%%r1,%2)          \n\t"
+	"vl %%v22,96(%%r1,%2)          \n\t"
+	"vl %%v23,112(%%r1,%2)         \n\t"
+
+	"vl %%v24, 0(%%r1,%3)          \n\t"
+	"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
+	"vst %%v24, 0(%%r1,%3)         \n\t"
+	"vl %%v25, 16(%%r1,%3)         \n\t"
+	"vfmasb %%v25,%%v17,%%v0,%%v25 \n\t"
+	"vst %%v25, 16(%%r1,%3)        \n\t"
+	"vl %%v26, 32(%%r1,%3)         \n\t"
+	"vfmasb %%v26,%%v18,%%v0,%%v26 \n\t"
+	"vst %%v26, 32(%%r1,%3)        \n\t"
+	"vl %%v27, 48(%%r1,%3)         \n\t"
+	"vfmasb %%v27,%%v19,%%v0,%%v27 \n\t"
+	"vst %%v27, 48(%%r1,%3)        \n\t"
+	"vl %%v28, 64(%%r1,%3)         \n\t"
+	"vfmasb %%v28,%%v20,%%v0,%%v28 \n\t"
+	"vst %%v28, 64(%%r1,%3)        \n\t"
+	"vl %%v29, 80(%%r1,%3)         \n\t"
+	"vfmasb %%v29,%%v21,%%v0,%%v29 \n\t"
+	"vst %%v29, 80(%%r1,%3)        \n\t"
+	"vl %%v30, 96(%%r1,%3)         \n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" + "vst %%v30, 96(%%r1,%3) \n\t" + "vl %%v31, 112(%%r1,%3) \n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" + "vst %%v31, 112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else + { + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j 0) { + + maxf = smax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if 
(x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c new file mode 100644 index 000000000..e882b7ff1 --- /dev/null +++ b/kernel/zarch/smin.c @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
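+*****************************************************************************/
+
+/* smin_kernel_64 below is a tournament reduction: VFCHSB writes an all-ones
+   lane mask where its first operand compares higher, and VSEL then keeps the
+   smaller lane, so the eight loaded registers are halved round by round
+   before merging into the running minimum in %%v0. Scalar equivalent:
+
+       FLOAT m = x[0];
+       BLASLONG k;
+       for (k = 1; k < n; k++)
+           if (x[k] < m) m = x[k];
+*/
+
+/***************************************************************************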
+*****************************************************************************/ + +#include "common.h" + +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = smin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c new file mode 100644 index 000000000..763cc664a --- /dev/null +++ b/kernel/zarch/srot.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 
2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s 
*/ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG 
i=0; + BLASLONG ix=0,iy=0; + + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c new file mode 100644 index 000000000..c18a7e56f --- /dev/null +++ b/kernel/zarch/sscal.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
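+*****************************************************************************/
+
+/* Two vector paths below: sscal_kernel_32 rescales x in place, while
+   sscal_kernel_32_zero covers da == 0.0 with stores only (no loads),
+   halving the memory traffic for the common "clear the vector" case. */
+
+/***************************************************************************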
+*****************************************************************************/ + +#include "common.h" + +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} + +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0,j=0; + if ( n <= 0 || inc_x <=0 ) + return(0); + + + if ( inc_x == 1 ) + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + + sscal_kernel_32_zero(n1, x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + sscal_kernel_32(n1, da, x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i]=0.0; + x[i + inc_x]=0.0; + + i += inc_x * 2; + j += 2; + + } + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i] = da * x[i] ; + x[i + inc_x] = da * x[i + inc_x]; + + i += inc_x * 2; + j += 2; + + } + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c new file mode 100644 index 000000000..d0c0dc3f4 --- /dev/null +++ b/kernel/zarch/sswap.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + 
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + sswap_kernel_64(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c new file mode 100644 index 000000000..6393b099b --- /dev/null +++ b/kernel/zarch/zamax.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
+{
+ FLOAT amax;
+
+ __asm__ volatile (
+ "vleg %%v0,0(%2),0 \n\t"
+ "vleg %%v16,8(%2),0 \n\t"
+ "vleg %%v0,16(%2),1 \n\t"
+ "vleg %%v16,24(%2),1 \n\t"
+ "vflpdb %%v0,%%v0 \n\t"
+ "vflpdb %%v16,%%v16 \n\t"
+ "vfadb %%v0,%%v0,%%v16 \n\t"
+ "srlg %%r0,%1,4 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+
+ "vleg %%v16,0(%%r1,%2),0 \n\t"
+ "vleg %%v17,8(%%r1,%2),0 \n\t"
+ "vleg %%v16,16(%%r1,%2),1 \n\t"
+ "vleg %%v17,24(%%r1,%2),1 \n\t"
+ "vleg %%v18,32(%%r1,%2),0 \n\t"
+ "vleg %%v19,40(%%r1,%2),0 \n\t"
+ "vleg %%v18,48(%%r1,%2),1 \n\t"
+ "vleg %%v19,56(%%r1,%2),1 \n\t"
+ "vleg %%v20,64(%%r1,%2),0 \n\t"
+ "vleg %%v21,72(%%r1,%2),0 \n\t"
+ "vleg %%v20,80(%%r1,%2),1 \n\t"
+ "vleg %%v21,88(%%r1,%2),1 \n\t"
+ "vleg %%v22,96(%%r1,%2),0 \n\t"
+ "vleg %%v23,104(%%r1,%2),0 \n\t"
+ "vleg %%v22,112(%%r1,%2),1 \n\t"
+ "vleg %%v23,120(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v16,%%v17 \n\t"
+ "vfchdb %%v25,%%v18,%%v19 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v24,%%v25 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v26,%%v0 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "vleg %%v16,128(%%r1,%2),0 \n\t"
+ "vleg %%v17,136(%%r1,%2),0 \n\t"
+ "vleg %%v16,144(%%r1,%2),1 \n\t"
+ "vleg %%v17,152(%%r1,%2),1 \n\t"
+ "vleg %%v18,160(%%r1,%2),0 \n\t"
+ "vleg %%v19,168(%%r1,%2),0 \n\t"
+ "vleg %%v18,176(%%r1,%2),1 \n\t"
+ "vleg %%v19,184(%%r1,%2),1 \n\t"
+ "vleg %%v20,192(%%r1,%2),0 \n\t"
+ "vleg %%v21,200(%%r1,%2),0 \n\t"
+ "vleg %%v20,208(%%r1,%2),1 \n\t"
+ "vleg %%v21,216(%%r1,%2),1 \n\t"
+ "vleg %%v22,224(%%r1,%2),0 \n\t"
+ "vleg %%v23,232(%%r1,%2),0 \n\t"
+ "vleg %%v22,240(%%r1,%2),1 \n\t"
+ "vleg %%v23,248(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v16,%%v17 \n\t"
+ "vfchdb %%v25,%%v18,%%v19 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v24,%%v25 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v26,%%v0 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "agfi %%r1, 256 \n\t"
+ "brctg %%r0, 0b \n\t"
+
+ "vrepg %%v16,%%v0,1 \n\t"
+ "wfchdb %%v17,%%v16,%%v0 \n\t"
+ "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+ "ldr %0,%%f0 "
+ :"=f"(amax)
+ :"r"(n),"ZR"((const FLOAT (*)[n])x)
+ :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+ );
+
+ return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+ BLASLONG i = 0;
+ BLASLONG j = 0;
+ FLOAT maxf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return (maxf);
+
+ if (inc_x == 1) {
+
+ BLASLONG n1 = n & -16;
+ if (n1 > 0) {
+
+ maxf = zamax_kernel_16(n1, x);
+
+ i = n1;
+ }
+ else
+ {
+ maxf=CABS1(x,0);
+ i++;
+ }
+
+ while (i < n) {
+ if (CABS1(x,i*2) > maxf) {
+ maxf = CABS1(x,i*2);
+ }
+ i++;
+ }
+ return (maxf);
+
+ } else {
+
+ inc_x2 = 2 * inc_x;
+ maxf=CABS1(x,0);
+ i += inc_x2;
+ j++;
+
+ BLASLONG n1 = (n - 1) & -4;
+ while (j < n1) {
+
+ if (CABS1(x,i) > maxf) {
+ maxf = CABS1(x,i);
+ }
+ if (CABS1(x,i+inc_x2) > maxf) {
+ maxf = CABS1(x,i+inc_x2);
+ }
+ if (CABS1(x,i+inc_x2*2) > maxf) {
+ maxf = CABS1(x,i+inc_x2*2);
+ }
+ if (CABS1(x,i+inc_x2*3) > maxf) {
+ maxf = CABS1(x,i+inc_x2*3);
+ }
+
+ i += inc_x2 * 4;
+
+ j += 4;
+
+ }
+
+
+ while (j < n) {
+ if (CABS1(x,i) > maxf) {
+ maxf = CABS1(x,i);
+ }
+ i += inc_x2;
+ j++;
+ }
+ return (maxf);
+ }
+}
diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c
new file mode 100644
index 000000000..b15774bb9
--- /dev/null
+++ b/kernel/zarch/zamin.c
@@ -0,0 +1,221 @@
+/***************************************************************************
+Copyright (c) 2013-2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
+{
+ FLOAT amin;
+
+ __asm__ volatile (
+ "vleg %%v0,0(%2),0 \n\t"
+ "vleg %%v16,8(%2),0 \n\t"
+ "vleg %%v0,16(%2),1 \n\t"
+ "vleg %%v16,24(%2),1 \n\t"
+ "vflpdb %%v0,%%v0 \n\t"
+ "vflpdb %%v16,%%v16 \n\t"
+ "vfadb %%v0,%%v0,%%v16 \n\t"
+ "srlg %%r0,%1,4 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+
+ "vleg %%v16,0(%%r1,%2),0 \n\t"
+ "vleg %%v17,8(%%r1,%2),0 \n\t"
+ "vleg %%v16,16(%%r1,%2),1 \n\t"
+ "vleg %%v17,24(%%r1,%2),1 \n\t"
+ "vleg %%v18,32(%%r1,%2),0 \n\t"
+ "vleg %%v19,40(%%r1,%2),0 \n\t"
+ "vleg %%v18,48(%%r1,%2),1 \n\t"
+ "vleg %%v19,56(%%r1,%2),1 \n\t"
+ "vleg %%v20,64(%%r1,%2),0 \n\t"
+ "vleg %%v21,72(%%r1,%2),0 \n\t"
+ "vleg %%v20,80(%%r1,%2),1 \n\t"
+ "vleg %%v21,88(%%r1,%2),1 \n\t"
+ "vleg %%v22,96(%%r1,%2),0 \n\t"
+ "vleg %%v23,104(%%r1,%2),0 \n\t"
+ "vleg %%v22,112(%%r1,%2),1 \n\t"
+ "vleg %%v23,120(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v17,%%v16 \n\t"
+ "vfchdb %%v25,%%v19,%%v18 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v25,%%v24 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v0,%%v26 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "vleg %%v16,128(%%r1,%2),0 \n\t"
+ "vleg %%v17,136(%%r1,%2),0 \n\t"
+ "vleg %%v16,144(%%r1,%2),1 \n\t"
+ "vleg %%v17,152(%%r1,%2),1 \n\t"
+ "vleg %%v18,160(%%r1,%2),0 \n\t"
+ "vleg %%v19,168(%%r1,%2),0 \n\t"
+ "vleg %%v18,176(%%r1,%2),1 \n\t"
+ "vleg %%v19,184(%%r1,%2),1 \n\t"
+ "vleg %%v20,192(%%r1,%2),0 \n\t"
+ "vleg %%v21,200(%%r1,%2),0 \n\t"
+ "vleg %%v20,208(%%r1,%2),1 \n\t"
+ "vleg %%v21,216(%%r1,%2),1 \n\t"
+ "vleg %%v22,224(%%r1,%2),0 \n\t"
+ "vleg %%v23,232(%%r1,%2),0 \n\t"
+ "vleg %%v22,240(%%r1,%2),1 \n\t"
+ "vleg %%v23,248(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v17,%%v16 \n\t"
+ "vfchdb %%v25,%%v19,%%v18 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v25,%%v24 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v0,%%v26 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "agfi %%r1, 256 \n\t"
+ "brctg %%r0, 0b \n\t"
+
+ "vrepg %%v16,%%v0,1 \n\t"
+ "wfchdb %%v17,%%v0,%%v16 \n\t"
+ "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+ "ldr %0,%%f0 "
+ :"=f"(amin)
+ :"r"(n),"ZR"((const FLOAT (*)[n])x)
+ :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+ );
+
+ return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+ BLASLONG i = 0;
+ BLASLONG j = 0;
+ FLOAT minf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return (minf);
+
+ if (inc_x == 1) {
+
+ BLASLONG n1 = n & -16;
+ if (n1 > 0) {
+
+ minf = zamin_kernel_16(n1, x);
+
+ i = n1;
+ }
+ else
+ {
+ minf=CABS1(x,0);
+ i++;
+ }
+
+ while (i < n) {
+ if (CABS1(x,i*2) < minf) {
+ minf = CABS1(x,i*2);
+ }
+ i++;
+ }
+ return (minf);
+
+ } else {
+
+ inc_x2 = 2 * inc_x;
+ minf=CABS1(x,0);
+ i += inc_x2;
+ j++;
+
+ BLASLONG n1 = (n - 1) & -4;
+ while (j < n1) {
+
+ if (CABS1(x,i) < minf) {
+ minf = CABS1(x,i);
+ }
+ if (CABS1(x,i+inc_x2) < minf) {
+ minf = CABS1(x,i+inc_x2);
+ }
+ if (CABS1(x,i+inc_x2*2) < minf) {
+ minf = CABS1(x,i+inc_x2*2);
+ }
+ if (CABS1(x,i+inc_x2*3) < minf) {
+ minf = CABS1(x,i+inc_x2*3);
+ }
+
+ i += inc_x2 * 4;
+
+ j += 4;
+
+ }
+
+
+ while (j < n) {
+ if (CABS1(x,i) < minf) {
+ minf = CABS1(x,i);
+ }
+ i += inc_x2;
+ j++;
+ }
+ return (minf);
+ }
+}
diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c
index 0fc5c9ecb..8faaf20eb 100644
--- a/kernel/zarch/zasum.c
+++ b/kernel/zarch/zasum.c
@@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
 #include "common.h"
 #include <math.h>
 #if defined(DOUBLE)
-
 #define ABS fabs
-
 #else
-
 #define ABS fabsf
-
 #endif
-
-static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
-
+static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
+{
 FLOAT asum;
- __asm__ (
- "pfd 1, 0(%[ptr_x]) \n\t"
- "sllg %%r0,%[n],4 \n\t"
- "agr %%r0,%[ptr_x] \n\t"
- "vzero %%v0 \n\t"
- "vzero %%v1 \n\t"
- "vzero %%v22 \n\t"
- "vzero %%v23 \n\t"
- ".align 16 \n\t"
- "1: \n\t"
- "pfd 1, 256(%[ptr_tmp] ) \n\t"
- "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
-
- "vflpdb %%v24, %%v24 \n\t"
- "vflpdb %%v25, %%v25 \n\t"
- "vflpdb %%v26, %%v26 \n\t"
- "vflpdb %%v27, %%v27 \n\t"
- "vflpdb %%v28, %%v28 \n\t"
- "vflpdb %%v29, %%v29 \n\t"
- "vflpdb %%v30, %%v30 \n\t"
- "vflpdb %%v31, %%v31 \n\t"
-
- "vfadb %%v0,%%v0,%%v24 \n\t"
- "vfadb %%v1,%%v1,%%v25 \n\t"
- "vfadb %%v23,%%v23,%%v26 \n\t"
- "vfadb %%v22,%%v22,%%v27 \n\t"
- "vfadb %%v0,%%v0,%%v28 \n\t"
- "vfadb %%v1,%%v1,%%v29 \n\t"
- "vfadb %%v23,%%v23,%%v30 \n\t"
- "vfadb %%v22,%%v22,%%v31 \n\t"
-
- "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
- "vflpdb %%v24, %%v24 \n\t"
- "vflpdb %%v25, %%v25 \n\t"
- "vflpdb %%v26, %%v26 \n\t"
- "vflpdb %%v27, %%v27 \n\t"
- "vflpdb %%v28, %%v28 \n\t"
- "vflpdb %%v29, %%v29 \n\t"
- "vflpdb %%v30, %%v30 \n\t"
- "vflpdb %%v31, %%v31 \n\t"
- "la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
- "vfadb %%v0,%%v0,%%v24 \n\t"
- "vfadb %%v1,%%v1,%%v25 \n\t"
- "vfadb %%v23,%%v23,%%v26 \n\t"
- "vfadb %%v22,%%v22,%%v27 \n\t"
- "vfadb %%v0,%%v0,%%v28 \n\t"
- "vfadb %%v1,%%v1,%%v29 \n\t"
- "vfadb %%v23,%%v23,%%v30 \n\t"
- "vfadb %%v22,%%v22,%%v31 \n\t"
-
- "clgrjl %[ptr_tmp],%%r0,1b \n\t"
- "vfadb %%v24,%%v0,%%v1 \n\t"
- "vfadb %%v25,%%v23,%%v22 \n\t"
- "vfadb %%v0,%%v25,%%v24 \n\t"
- "vrepg %%v1,%%v0,1 \n\t"
- "adbr %%f0,%%f1 \n\t"
- "ldr %[asum] ,%%f0"
- : [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
- : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
- : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
- );
- return asum;
+ __asm__ (
+ "vzero %%v0 \n\t"
+ "vzero %%v1 \n\t"
+ "vzero %%v2 \n\t"
+ "vzero %%v3 \n\t"
+ "srlg %%r0,%1,4 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+ "vl %%v16, 0(%%r1,%2) \n\t"
+ "vl %%v17, 16(%%r1,%2) \n\t"
+ "vl %%v18, 32(%%r1,%2) \n\t"
+ "vl %%v19, 48(%%r1,%2) \n\t"
+ "vl %%v20, 64(%%r1,%2) \n\t"
+
"vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + return asum; } - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; @@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( n1 > 0 ) { - sumf=zasum_kernel_16(n1, x ); + sumf = zasum_kernel_16(n1, x); i=n1; ip=2*n1; } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 212de25c8..6ba44a27c 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ - +*****************************************************************************/ #include "common.h" - -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { - - BLASLONG tempR1 ; - __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" - "pfd 2, 0(%[y_tmp]) \n\t" +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( #if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 1, 256(%[t1],%[x_tmp]) \n\t" - "pfd 2, 256(%[t1],%[y_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" - "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" - "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" - "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" - "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" - "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v6 , 
32(%[t1],%[y_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - + "vlrepg %%v0,0(%3) \n\t" + "vleg %%v1,8(%3),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%3),1 \n\t" +#else + "vleg %%v0,0(%3),1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,0(%3),0 \n\t" + "vlrepg %%v1,8(%3) \n\t" +#endif + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT 
*y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; + FLOAT da[2]; if (n <= 0) return (0); @@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; - if (n1) { - zaxpy_kernel_8(n1, x, y, da_r,da_i); + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); ix = 2 * n1; } i = n1; diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index b5bf383f7..8c940bba3 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - -#include "common.h" - -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) - : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; +#include "common.h" +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,4 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","r2" + ); } - int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG 
inc_y) { BLASLONG i=0; @@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } } - return(0); - + return(0); } - - diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 61c5d6b98..aab18e2e9 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" -#if defined(Z13) - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ __asm__ volatile( - "pfd 1, 0(%[ptr_x_tmp]) \n\t" - "pfd 1, 0(%[ptr_y_tmp]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %[n_tmp],%[n_tmp],3 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" - "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - - "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - "la %%r1,128(%%r1) \n\t" - "brctg %[n_tmp],1b \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vsteg %%v24, 0(%[ptr_d]),0 \n\t" - "vsteg %%v24, 8(%[ptr_d]),1 \n\t" - "vsteg %%v25,16(%[ptr_d]),1 \n\t" - "vsteg %%v25,24(%[ptr_d]),0 \n\t" - : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) - : [mem_x] "m"( *(const double (*)[2*n])x), - [mem_y] "m"( *(const double (*)[2*n])y), - [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) - : "cc", "r1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - -} - -#else - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - 
BLASLONG register i = 0; - FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; - BLASLONG j = 0; - - while (i < n) { - - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; - - dot[0] += x[j + 2] * y[j + 2]; - dot[1] += x[j + 3] * y[j + 3]; - dot[2] += x[j + 2] * y[j + 3]; - dot[3] += x[j + 3] * y[j + 2]; - - dot[0] += x[j + 4] * y[j + 4]; - dot[1] += x[j + 5] * y[j + 5]; - dot[2] += x[j + 4] * y[j + 5]; - dot[3] += x[j + 5] * y[j + 4]; - - dot[0] += x[j + 6] * y[j + 6]; - dot[1] += x[j + 7] * y[j + 7]; - dot[2] += x[j + 6] * y[j + 7]; - dot[3] += x[j + 7] * y[j + 6]; - - j += 8; - i += 4; - - } - d[0] = dot[0]; - d[1] = dot[1]; - d[2] = dot[2]; - d[3] = dot[3]; - + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v24,%%v24,%%v26 \n\t" + "vfadb %%v24,%%v24,%%v28 \n\t" + "vfadb %%v24,%%v24,%%v30 \n\t" + "vfadb %%v25,%%v25,%%v27 \n\t" + "vfadb %%v25,%%v25,%%v29 \n\t" + "vfadb %%v25,%%v25,%%v31 \n\t" + "vsteg %%v24,0(%3),0 \n\t" + "vsteg %%v24,8(%3),1 \n\t" + "vsteg %%v25,16(%3),1 \n\t" + "vsteg %%v25,24(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix=0, iy=0; + BLASLONG i; + BLASLONG ix, iy; OPENBLAS_COMPLEX_FLOAT result; FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; @@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { BLASLONG n1 = n & -8; - BLASLONG j=0; - if (n1){ + if (n1) zdot_kernel_8(n1, x, y, dot); - i = n1; - j = n1 <<1; - } - + + i = 
n1; + BLASLONG j = i * 2; while (i < n) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 380f0140e..75027a06c 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27,112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19,112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) 
\n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "clgrjl %%r1,%[tmp],1b \n\t" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc","r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 
16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl 
%%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) - { - zrot_kernel_16(n1, x, y, c, s); + { + FLOAT cosa,sina; + cosa=c; + sina=s; + zrot_kernel_16(n1, x, y, &cosa, &sina); i=n1; ix=2*n1; } @@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4764c0a52..4d8ee960f 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ +*****************************************************************************/ #include "common.h" - - -static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { - BLASLONG tempR1 ; - __asm__ ( - "pfd 2, 0(%[x_tmp]) \n\t" -#if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 2, 256(%[t1],%[x_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmdb %%v30, %%v20, %%v28 \n\t" - "vfmdb %%v31, %%v21, %%v28 \n\t" - "vfmdb %%v6, %%v22, %%v28 \n\t" - "vfmdb %%v7, %%v23, %%v28 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - - "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 
0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - - +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "vleg %%v1,8(%1),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + "vpdi %%v28,%%v20,%%v20,4 \n\t" + "vpdi %%v29,%%v21,%%v21,4 \n\t" + "vpdi %%v30,%%v22,%%v22,4 \n\t" + "vpdi %%v31,%%v23,%%v23,4 \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%1) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint - "vflcdb %%v16,%%v16 \n\t" //complement both - "vlvgg %%v16,%%r0,0 \n\t" //restore 1st - "vlr %%v17 ,%%v16 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v16 \n\t" - "vsteg %%v24, 0(%[x_ptr]),1 \n\t" - "vsteg %%v24, 8(%[x_ptr]),0 \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v17 \n\t" - "vsteg %%v25, 16(%[x_ptr]),1 \n\t" - "vsteg %%v25, 24(%[x_ptr]),0 \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vsteg %%v26, 32(%[x_ptr]),1 \n\t" - "vsteg %%v26, 40(%[x_ptr]),0 \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vsteg %%v27, 48(%[x_ptr]),1 \n\t" - "vsteg %%v27, 56(%[x_ptr]),0 \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v16 \n\t" - "vsteg %%v28, 64(%[x_ptr]),1 \n\t" - "vsteg %%v28, 72(%[x_ptr]),0 \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v17 \n\t" - "vsteg %%v29, 80(%[x_ptr]),1 \n\t" - "vsteg %%v29, 88(%[x_ptr]),0 \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb 
%%v30,%%v30,%%v16 \n\t" - "vsteg %%v30, 96(%[x_ptr]),1 \n\t" - "vsteg %%v30, 104(%[x_ptr]),0 \n\t" - "vl %%v31, 112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vsteg %%v31, 112(%[x_ptr]),1 \n\t" - "vsteg %%v31, 120(%[x_ptr]),0 \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_i) - :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - - +static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vleg %%v0,8(%1),0 \n\t" + "wflcdb %%v0,%%v0 \n\t" + "vleg %%v0,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v16,%%v16,%%v16,4 \n\t" + "vpdi %%v17,%%v17,%%v17,4 \n\t" + "vpdi %%v18,%%v18,%%v18,4 \n\t" + "vpdi %%v19,%%v19,%%v19,4 \n\t" + "vpdi %%v20,%%v20,%%v20,4 \n\t" + "vpdi %%v21,%%v21,%%v21,4 \n\t" + "vpdi %%v22,%%v22,%%v22,4 \n\t" + "vpdi %%v23,%%v23,%%v23,4 \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v18,%%r0,%%r0 \n\t" - "vlr %%v19,%%v18 \n\t" - "vlr %%v16,%%v18 \n\t" - "vlr %%v17,%%v18 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v18 \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v19 \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vst %%v26, 32(%[x_ptr]) \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vst %%v27, 48(%[x_ptr]) \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v18 \n\t" - "vst %%v28, 64(%[x_ptr]) \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v19 \n\t" - "vst %%v29, 80(%[x_ptr]) \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vst %%v30, 96(%[x_ptr]) \n\t" - "vl %%v31,112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vst %%v31,112(%[x_ptr]) \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_r) - : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" - ); - +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + 
"vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256( %[x_ptr]) \n\t" - "vst %%v24, 0( %[x_ptr]) \n\t" - "vst %%v25, 16( %[x_ptr]) \n\t" - "vst %%v26, 32( %[x_ptr]) \n\t" - "vst %%v27, 48( %[x_ptr]) \n\t" - "vst %%v24, 64( %[x_ptr]) \n\t" - "vst %%v25, 80( %[x_ptr]) \n\t" - "vst %%v26, 96( %[x_ptr]) \n\t" - "vst %%v27,112( %[x_ptr]) \n\t" - - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" ,"r0","v24","v25","v26","v27" - ); - +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } - - - - -static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { - +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; - for (i = 0; i < n; i += 4) { + for (i = 0; i < n; i += 4) + { t0 = da_r * x[0] - da_i * x[1]; t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; @@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS x[inc_x3] = t3; x += 4 * inc_x; - } - - } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; FLOAT temp0; FLOAT temp1; - + FLOAT alpha[2] __attribute__ ((aligned(16))); if (inc_x != 1) { inc_x <<= 1; @@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { BLASLONG n1 = n & -8; - if (n1 > 0) { - zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); + if (n1 > 0) { + 
alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); j = n1; i = n1 * inc_x; } @@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; if (da_r == 0.0) if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, da_i, x); + zscal_kernel_8_zero_r(n1, alpha, x); else if (da_i == 0) - zscal_kernel_8_zero_i(n1, da_r, x); + zscal_kernel_8_zero_i(n1, alpha, x); else - zscal_kernel_8(n1, da_r,da_i, x); + zscal_kernel_8(n1, alpha, x); i = n1 << 1; j = n1; @@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return (0); } - - diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 062079002..a16b87cdc 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" - -#if defined(Z13_SWAP_A) -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 
208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ volatile( + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) 
\n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#endif - - - - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; diff --git a/ztest/Makefile b/ztest/Makefile new file mode 100644 index 000000000..0ff7fe46a --- /dev/null +++ b/ztest/Makefile @@ -0,0 +1,437 @@ +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + +goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto + +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dsdot #################################################### +dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX 
############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cgemv #################################################### + 
+cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Crot #################################################### +crot.goto : crot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zrot #################################################### +zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +################################################################################################### + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +dsdot.$(SUFFIX) : dsdot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ismin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +crot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + 
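A note on the compile rules here: each generic source (dot.c, asum.c, axpy.c, and so on) is built four times, with -UCOMPLEX/-DCOMPLEX and -UDOUBLE/-DDOUBLE selecting the domain and precision of the s/d/c/z binaries. Below is a minimal sketch of the macro convention this flag matrix relies on; the authoritative definitions live in common.h and may differ in detail:

/* Hedged sketch of what the -U/-D flag matrix toggles (see common.h). */
#ifdef DOUBLE
#define FLOAT double        /* d- and z-prefixed builds */
#else
#define FLOAT float         /* s- and c-prefixed builds */
#endif

#ifdef COMPLEX
#define COMPSIZE 2          /* two FLOATs (re, im) per element */
#else
#define COMPSIZE 1
#endif

So zasum.o is asum.c compiled with -DCOMPLEX -DDOUBLE, and the drivers size their buffers as sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE, matching the malloc calls in the driver sources that follow.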
+dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +clean :: + @rm -f *.goto + diff --git a/ztest/amax.c b/ztest/amax.c new file mode 100644 index 000000000..f2e3f5411 --- /dev/null +++ b/ztest/amax.c @@ -0,0 +1,235 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef AMAX +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* 
avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef AMIN +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + 
BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} +#else +FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} +#endif + +#undef ASUM +#ifdef COMPLEX +#ifdef DOUBLE +#define ASUM BLASFUNC(dzasum) +#else +#define ASUM BLASFUNC(scasum) +#endif +#else +#ifdef DOUBLE +#define ASUM BLASFUNC(dasum) +#else +#define ASUM BLASFUNC(sasum) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, 
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} +#else +int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef AXPY +#ifdef COMPLEX +#ifdef DOUBLE +#define AXPY BLASFUNC(zaxpy) +#else +#define AXPY BLASFUNC(caxpy) +#endif +#else +#ifdef DOUBLE +#define AXPY BLASFUNC(daxpy) +#else +#define AXPY BLASFUNC(saxpy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c;; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + argc--;argv++; + + blasint iy; + int test = 1; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of 
Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} +#else +int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef COPY +#ifdef COMPLEX +#ifdef DOUBLE +#define COPY BLASFUNC(zcopy) +#else +#define COPY BLASFUNC(ccopy) +#endif +#else +#ifdef DOUBLE +#define COPY BLASFUNC(dcopy) +#else +#define COPY BLASFUNC(scopy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) 
inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} +#else +FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} +#endif + +#undef DOT +#ifdef COMPLEX +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif +#else +#ifdef DOUBLE +#define DOT BLASFUNC(ddot) +#else +#define DOT BLASFUNC(sdot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, 
SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; +#ifdef COMPLEX + OPENBLAS_COMPLEX_FLOAT result, result_c; +#else + FLOAT result, result_c; +#endif + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} + +#undef DSDOT +#define DSDOT BLASFUNC(dsdot) + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + 
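The driver that follows uses the same harness as the other files in benchmark/: each problem size is timed once for the optimized BLAS call and once for the plain C reference, and the two results are compared with assert_dbl_near. A minimal sketch of that timed inner loop for this DSDOT driver is given here; the variable names match the declarations in main below, while the loop shape itself is an assumption modelled on the sibling drivers rather than text taken from this patch:

    for (l = 0; l < loops; l++)
    {
        gettimeofday(&start, NULL);
        result = DSDOT(&m, x, &inc_x, y, &inc_y);      /* optimized BLAS call */
        gettimeofday(&stop, NULL);
        time1 = (double)(stop.tv_sec - start.tv_sec)
              + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;
        timeg += time1;

        gettimeofday(&start, NULL);
        result_c = dsdot_c(m, x, inc_x, y, inc_y);     /* plain C reference */
        gettimeofday(&stop, NULL);
        time1 = (double)(stop.tv_sec - start.tv_sec)
              + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;
        timeg_c += time1;

        /* assert_dbl_near() returns 1 when the two results agree within tol */
        if (!assert_dbl_near(result_c, result, DOUBLE_EPS))
            test = 0;
    }

The Time and CTime columns printed by these drivers presumably report timeg/loops and timeg_c/loops, the per-call averages of the two accumulated timings.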
+int main(int argc, char *argv[]){ + + FLOAT *x, *y; + double result, result_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; jtv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y, *y_c; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char trans='N'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + blasint n=0; + int has_param_n = 0; + int has_param_m = 0; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + + int tomax = to; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = 
getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; + } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } + + + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + if (has_param_m == 0) + { + + for(m = from; m <= to; m += step) + { + timeg=0; + timeg_c=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d :", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} +#else +BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} +#endif + +#undef IAMAX +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMAX BLASFUNC(izamax) +#else +#define IAMAX BLASFUNC(icamax) +#endif +#else +#ifdef DOUBLE +#define IAMAX BLASFUNC(idamax) +#else +#define IAMAX BLASFUNC(isamax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres 
-= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} +#else +BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} +#endif + +#undef IAMIN +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMIN BLASFUNC(izamin) +#else +#define IAMIN BLASFUNC(icamin) +#endif +#else +#ifdef DOUBLE +#define IAMIN BLASFUNC(idamin) +#else +#define IAMIN BLASFUNC(isamin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + 
FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + +#undef IMAX +#ifdef DOUBLE +#define IMAX BLASFUNC(idmax) +#else +#define IMAX BLASFUNC(ismax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = 
(long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + +#undef IMIN +#ifdef DOUBLE +#define IMIN BLASFUNC(idmin) +#else +#define IMIN BLASFUNC(ismin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) 
< 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef MAX_ +#ifdef DOUBLE +#define MAX_ BLASFUNC(dmax) +#else +#define MAX_ BLASFUNC(smax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int 
loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef MIN_ +#ifdef DOUBLE +#define MIN_ BLASFUNC(dmin) +#else +#define MIN_ BLASFUNC(smin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = 
getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef ROT +#ifdef COMPLEX +#ifdef DOUBLE +#define ROT BLASFUNC(zdrot) +#else +#define ROT BLASFUNC(csrot) +#endif +#else +#ifdef DOUBLE +#define ROT BLASFUNC(drot) +#else +#define ROT BLASFUNC(srot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + // FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + FLOAT c[1] = { 2.0 }; + FLOAT s[1] = { 2.0 }; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + 
int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; + + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + inc_x2 = 2 * inc_x; + for ( i=0; itv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *x_c; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, 
step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef SWAP +#ifdef COMPLEX +#ifdef DOUBLE +#define SWAP BLASFUNC(zswap) +#else +#define SWAP BLASFUNC(cswap) +#endif +#else +#ifdef DOUBLE +#define SWAP BLASFUNC(dswap) +#else +#define SWAP BLASFUNC(sswap) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 
200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l Date: Mon, 6 Aug 2018 20:03:49 +0300 Subject: [PATCH 163/935] [ZARCH] Restore detect() function --- cpuid_zarch.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 0ae32f27d..073419fa8 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -45,9 +45,29 @@ static char *cpuname_lower[] = { int detect(void) { - // return CPU_GENERIC; - return CPU_Z14; - + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = fopen("/proc/sysinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("Type", buffer, 4)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + fclose(infile); + + if (strstr(p, "2964")) return CPU_Z13; + if (strstr(p, "2965")) return CPU_Z13; + if (strstr(p, "3906")) return CPU_Z14; + if (strstr(p, "3907")) return CPU_Z14; + + return CPU_GENERIC; } void get_libname(void) From 7932ff3ea9666ab022c20354672b2597c756ee02 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 8 Aug 2018 02:59:11 +0000 Subject: [PATCH 164/935] Add an AVX512 enabled DDOT function written in C intrinsics for best readability. (the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/ddot.c | 4 +- kernel/x86_64/ddot_microk_skylakex-2.c | 96 ++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/ddot_microk_skylakex-2.c diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 0dc9cd3da..969357614 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c new file mode 100644 index 000000000..8eabf225a --- /dev/null +++ b/kernel/x86_64/ddot_microk_skylakex-2.c @@ -0,0 +1,96 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_8 1
+
+#include <immintrin.h>
+
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+    int i = 0;
+    __m256d accum_0, accum_1, accum_2, accum_3;
+
+    accum_0 = _mm256_setzero_pd();
+    accum_1 = _mm256_setzero_pd();
+    accum_2 = _mm256_setzero_pd();
+    accum_3 = _mm256_setzero_pd();
+
+#ifdef __AVX512CD__
+    __m512d accum_05, accum_15, accum_25, accum_35;
+    int n32;
+    n32 = n & (~31);
+
+    accum_05 = _mm512_setzero_pd();
+    accum_15 = _mm512_setzero_pd();
+    accum_25 = _mm512_setzero_pd();
+    accum_35 = _mm512_setzero_pd();
+
+    for (; i < n32; i += 32) {
+        accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]);
+        accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]);
+        accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]);
+        accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]);
+    }
+
+    /*
+     * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
+     * below can continue using the intermediate results in its loop
+     */
+    accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1);
+    accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1);
+    accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1);
+    accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1);
+
+#endif
+    for (; i < n; i += 16) {
+        accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]);
+        accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]);
+        accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]);
+        accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]);
+    }
+
+    /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
+
+    accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+
+    __m128d half_accum0;
+
+    /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
+    half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
+
+    /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
+    half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
+
+    *dot = half_accum0[0];
+}
+
+#else
+#include "ddot_microk_haswell-2.c"
+#endif

From 33043f563fb6849d4afee45cbcf85d03aa561a4e Mon Sep 17 00:00:00 2001
From: maamountki
Date: Fri, 10 Aug 2018 01:54:18 +0300
Subject: [PATCH 165/935] Disable scal to benchmark zgemv separately by default

---
 benchmark/gemv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/gemv.c b/benchmark/gemv.c
index c06e829d9..b6a42f42f 100644
--- a/benchmark/gemv.c
+++ b/benchmark/gemv.c
@@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
 
   FLOAT *a, *x, *y;
   FLOAT alpha[] = {1.0, 1.0};
-  FLOAT beta [] = {1.0, 1.0};
+  FLOAT beta [] = {1.0, 0.0};
   char trans='N';
   blasint m, i, j;
   blasint inc_x=1,inc_y=1;

From 00abaa865bea441f20bb29b35dfb0524f112b34e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Fri, 10 Aug 2018 02:31:48 +0000
Subject: [PATCH 166/935] Add an AVX512 enabled SDOT function written in C intrinsics for best readability.
(the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/sdot.c | 4 +- kernel/x86_64/sdot_microk_skylakex-2.c | 98 ++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sdot_microk_skylakex-2.c diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index c3ab2ffe6..3536afc9e 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c new file mode 100644 index 000000000..4740161f4 --- /dev/null +++ b/kernel/x86_64/sdot_microk_skylakex-2.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_16 1
+
+#include <immintrin.h>
+
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+
+{
+    int i = 0;
+    __m256 accum_0, accum_1, accum_2, accum_3;
+
+    accum_0 = _mm256_setzero_ps();
+    accum_1 = _mm256_setzero_ps();
+    accum_2 = _mm256_setzero_ps();
+    accum_3 = _mm256_setzero_ps();
+
+#ifdef __AVX512CD__
+    __m512 accum_05, accum_15, accum_25, accum_35;
+    int n64;
+    n64 = n & (~63);
+
+    accum_05 = _mm512_setzero_ps();
+    accum_15 = _mm512_setzero_ps();
+    accum_25 = _mm512_setzero_ps();
+    accum_35 = _mm512_setzero_ps();
+
+    for (; i < n64; i += 64) {
+        accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]);
+        accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]);
+        accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]);
+        accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]);
+    }
+
+    /*
+     * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
+     * below can continue using the intermediate results in its loop
+     */
+    accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1);
+    accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1);
+    accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1);
+    accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1);
+
+#endif
+    for (; i < n; i += 32) {
+        accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]);
+        accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]);
+        accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]);
+        accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]);
+    }
+
+    /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */
+
+    accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+
+    __m128 half_accum0;
+
+    /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */
+    half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1);
+
+    /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */
+    half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
+    half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
+
+    *dot = half_accum0[0];
+}
+
+#else
+#include "sdot_microk_haswell-2.c"
+#endif

From 2e99873ff7112b6b35d35cf87eb34762f3f3d38b Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Fri, 10 Aug 2018 02:58:32 +0000
Subject: [PATCH 167/935] Add AVX512 enabled SAXPY/DAXPY functions written in C intrinsics for best readability.
(the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/daxpy.c | 4 +- kernel/x86_64/daxpy_microk_skylakex-2.c | 71 +++++++++++++++++++++++++ kernel/x86_64/saxpy.c | 4 +- kernel/x86_64/saxpy_microk_skylakex-2.c | 69 ++++++++++++++++++++++++ 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/daxpy_microk_skylakex-2.c create mode 100644 kernel/x86_64/saxpy_microk_skylakex-2.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index b4acdccd2..cde5bdaa6 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c new file mode 100644 index 000000000..e785a39f1 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_skylakex-2.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#include <immintrin.h>
+
+#define HAVE_KERNEL_8 1
+
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+	BLASLONG i = 0;
+
+	__m256d __alpha;
+
+	__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+
+#ifdef __AVX512CD__
+	BLASLONG n32;
+	__m512d __alpha5;
+	__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
+
+	n32 = n & ~31;
+
+	for (; i < n32; i+= 32) {
+		_mm512_storeu_pd(&y[i +  0], _mm512_loadu_pd(&y[i +  0]) + __alpha5 * _mm512_loadu_pd(&x[i +  0]));
+		_mm512_storeu_pd(&y[i +  8], _mm512_loadu_pd(&y[i +  8]) + __alpha5 * _mm512_loadu_pd(&x[i +  8]));
+		_mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16]));
+		_mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24]));
+	}
+
+#endif
+
+	for (; i < n; i+= 16) {
+		_mm256_storeu_pd(&y[i +  0], _mm256_loadu_pd(&y[i +  0]) + __alpha * _mm256_loadu_pd(&x[i +  0]));
+		_mm256_storeu_pd(&y[i +  4], _mm256_loadu_pd(&y[i +  4]) + __alpha * _mm256_loadu_pd(&x[i +  4]));
+		_mm256_storeu_pd(&y[i +  8], _mm256_loadu_pd(&y[i +  8]) + __alpha * _mm256_loadu_pd(&x[i +  8]));
+		_mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12]));
+	}
+}
+#else
+#include "daxpy_microk_haswell-2.c"
+#endif
+
+
diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c
index d89c4070d..e1349da58 100644
--- a/kernel/x86_64/saxpy.c
+++ b/kernel/x86_64/saxpy.c
@@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(NEHALEM)
 #include "saxpy_microk_nehalem-2.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN)
 #include "saxpy_microk_haswell-2.c"
+#elif defined (SKYLAKEX)
+#include "saxpy_microk_skylakex-2.c"
 #elif defined(SANDYBRIDGE)
 #include "saxpy_microk_sandy-2.c"
 #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c
new file mode 100644
index 000000000..950f10ba2
--- /dev/null
+++ b/kernel/x86_64/saxpy_microk_skylakex-2.c
@@ -0,0 +1,69 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_16 1
+
+#include <immintrin.h>
+
+static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+	BLASLONG i = 0;
+
+	__m256 __alpha;
+
+	__alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha));
+
+#ifdef __AVX512CD__
+	BLASLONG n64;
+	__m512 __alpha5;
+	__alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha));
+
+	n64 = n & ~63;
+
+	for (; i < n64; i+= 64) {
+		_mm512_storeu_ps(&y[i +  0], _mm512_loadu_ps(&y[i +  0]) + __alpha5 * _mm512_loadu_ps(&x[i +  0]));
+		_mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16]));
+		_mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32]));
+		_mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48]));
+	}
+
+#endif
+
+	for (; i < n; i+= 32) {
+		_mm256_storeu_ps(&y[i +  0], _mm256_loadu_ps(&y[i +  0]) + __alpha * _mm256_loadu_ps(&x[i +  0]));
+		_mm256_storeu_ps(&y[i +  8], _mm256_loadu_ps(&y[i +  8]) + __alpha * _mm256_loadu_ps(&x[i +  8]));
+		_mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16]));
+		_mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24]));
+	}
+}
+#else
+#include "saxpy_microk_haswell-2.c"
+#endif
+

From c52a831ae446a4ea9ead4948a2d1ab38034677b5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 10 Aug 2018 13:23:47 +0200
Subject: [PATCH 168/935] Add changes from the 0.3.x releases

fixes #1727
---
 Changelog.txt | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index cb6fee70a..33dcacc51 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,115 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.2
+30-Jul-2018
+
+common:
+	* fixes for regressions caused by the rewrite of the thread
+	  initialization code in 0.3.1
+
+POWER:
+	* fixed cpu autodetection for the BSDs
+
+MIPS64:
+	* fixed utest errors in AXPY, DSDOT, ROT and SWAP
+
+x86_64:
+	* added autodetection of AMD Ryzen 2
+	* fixed build with older versions of MSVC
+
+====================================================================
+Version 0.3.1
+01-Jul-2018
+
+common:
+	* rewritten thread initialization code with significantly reduced overhead
+	* added CBLAS interfaces to the IxAMIN BLAS extension functions
+	* fixed the lapack-test target
+	* CMAKE builds now create an OpenBLASConfig.cmake file
+	* ZAXPY now uses a single thread for small input sizes
+	* the LAPACK code was updated from Reference-LAPACK/lapack#253
+	  (fixing LAPACKE interfaces to Aasen's functions)
+
+POWER:
+	* corrected CROT and ZROT behaviour with zero INC_X
+
+ARMV7:
+	* corrected xDOT behaviour with zero INC_X or INC_Y
+
+x86_64:
+	* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
+	  this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
+	  (which will still be supported via the slower PRESCOTT kernels when this option is not set)
+	* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows
+	  specifying the list of x86_64 targets to include. Any target not on the list will be supported
+	  by the Sandybridge or Nehalem kernels if available, or by Prescott.
+	* improved SWITCH_RATIO on Haswell for increased GEMM throughput
+	* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
+	* added autodetection of Intel Cannon Lake series as Skylake X
+	* added a default L2 cache size for hypervisors that return zero here (Chromebook)
+	* fixed a name clash with recent Windows10 headers that broke the build with (at least)
+	  recent mingw from MSYS2
+	* fixed a link error in mixed clang/gfortran builds with OpenMP
+	* updated the OSX deployment target to 10.8
+	* switched on parallel make for builds on MS Windows by default
+
+x86:
+	* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
+
+====================================================================
+Version 0.3.0
+23-May-2018
+
+common:
+	* fixed some more thread race and locking bugs
+	* added preliminary support for calling an OpenMP build of the library from multiple threads
+	* removed performance impact of thread locks added in 0.2.20 on OpenMP code
+	* general code cleanup
+	* optimized DSDOT implementation
+	* improved thread distribution for GEMM
+	* corrected IMATCOPY/OMATCOPY implementation
+	* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
+	* cmake build improvements
+	* pkgconfig file now contains build options
+	* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
+	* corrections and improvements for systems with more than 64 cpus
+	* LAPACK code updated to 3.8.0 including later fixes
+	* added ReLAPACK, a recursive implementation of several LAPACK functions
+	* Rewrote ROTMG to handle cases that the netlib code failed to address
+	* Disabled (broken) multithreading code for xTRMV
+	* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
+	* shared memory access failures on startup are now handled more gracefully
+	* restored utests from earlier releases (and made them pass on all affected systems)
+
+SPARC:
+	* several fixes for cpu autodetection
+
+POWER:
+	* corrected vector register overwriting in several Power8 kernels
+	* optimized additional BLAS functions
+
+ARM:
+	* added support for CortexA53 and A72
+	* added autodetection for ThunderX2T99
+	* made most optimized kernels the default for generic ARMv8 targets
+
+x86_64:
+	* parallelized DDOT kernel for Haswell
+	* changed alignment directives in assembly kernels to boost performance on OSX
+	* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
+	* added support for building on OpenBSD and Dragonfly
+	* updated compiler options to work with Intel release 2018
+	* support fully optimized build with clang/flang on Microsoft Windows
+	* fixed building on AIX
+
+IBM Z:
+	* added optimized BLAS 1/2 functions
+
+MIPS:
+	* fixed cpu autodetection helper code
+	* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
+	* added mips64 I6500 cpu
+
 ====================================================================
 Version 0.2.20
24-Jul-2017 From cacacc8007eaf8c01ca32f289980ee8b91016b8f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 11 Aug 2018 17:14:57 +0000 Subject: [PATCH 169/935] Add an AVX512 enabled DSCAL function written in C intrinsics for best readability. (the same C code works for Haswell as well) For logistical reasons the code falls back to the existing haswell AVX2 implementation if the GCC or LLVM compiler is not new enough --- kernel/x86_64/dscal.c | 4 +- kernel/x86_64/dscal_microk_skylakex-2.c | 77 +++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dscal_microk_skylakex-2.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 2c7b3b17c..ef9a0a6ba 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c new file mode 100644 index 000000000..e0598272e --- /dev/null +++ b/kernel/x86_64/dscal_microk_skylakex-2.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#include <immintrin.h>
+
+#define HAVE_KERNEL_8 1
+
+static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+	int i = 0;
+
+#ifdef __AVX512CD__
+	__m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
+	for (; i < n; i += 8) {
+		_mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0]));
+	}
+#else
+	__m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+	for (; i < n; i += 8) {
+		_mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0]));
+		_mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4]));
+	}
+#endif
+}
+
+
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+	int i = 0;
+
+	/* question to self: Why is this not just memset() */
+
+#ifdef __AVX512CD__
+	__m512d zero = _mm512_setzero_pd();
+	for (; i < n; i += 8) {
+		_mm512_storeu_pd(&x[i], zero);
+	}
+#else
+	__m256d zero = _mm256_setzero_pd();
+	for (; i < n; i += 8) {
+		_mm256_storeu_pd(&x[i + 0], zero);
+		_mm256_storeu_pd(&x[i + 4], zero);
+	}
+#endif
+
+}
+
+#else
+#include "dscal_microk_haswell-2.c"
+#endif

From 36add7570a17c859ed51cb8e016286ce40c09293 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 11 Aug 2018 17:16:45 +0000
Subject: [PATCH 170/935] Fix typo in sdot function

it looks like my previous pull request was short the final commit; fix a
typo in sdot
---
 kernel/x86_64/sdot_microk_skylakex-2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c
index 4740161f4..1fcb7f27c 100644
--- a/kernel/x86_64/sdot_microk_skylakex-2.c
+++ b/kernel/x86_64/sdot_microk_skylakex-2.c
@@ -67,7 +67,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 	accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1);
 	accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1);
 	accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1);
-	accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1))
+	accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1);
 
 #endif
 	for (; i < n; i += 32) {

From 9493f263092d059fcf28f17e621f7396f776db80 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 11 Aug 2018 17:21:46 +0000
Subject: [PATCH 171/935] add short blurb about avx512 and needed compiler to
 README
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 02d087334..9ed9be337 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
 - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
 - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
 - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
+- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
 - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
 - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
 - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
 * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
   Clang 3.0 will generate the wrong AVX binary code.
+* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
 * The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
   there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
  the library with `BIGNUMA=1`.

From 87bebdbd8aacf30741222b722d5f7bde1e51c739 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 11 Aug 2018 17:38:12 +0000
Subject: [PATCH 172/935] Add an AVX512 enabled DGEMV (n) function written in
 C intrinsics for best readability.

(the same C code works for Haswell as well) For logistical reasons the
code falls back to the existing haswell AVX2 implementation if the GCC
or LLVM compiler is not new enough
---
 kernel/x86_64/dgemv_n_4.c                 |   4 +-
 kernel/x86_64/dgemv_n_microk_skylakex-4.c | 126 ++++++++++++++++++++++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 kernel/x86_64/dgemv_n_microk_skylakex-4.c

diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
index 309fbe767..6d2530e81 100644
--- a/kernel/x86_64/dgemv_n_4.c
+++ b/kernel/x86_64/dgemv_n_4.c
@@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(NEHALEM)
 #include "dgemv_n_microk_nehalem-4.c"
-#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX)
+#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR)
 #include "dgemv_n_microk_haswell-4.c"
+#elif defined (SKYLAKEX)
+#include "dgemv_n_microk_skylakex-4.c"
 #endif
 
diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
new file mode 100644
index 000000000..4030399ab
--- /dev/null
+++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
@@ -0,0 +1,126 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_KERNEL_4x4 1
+
+#include <immintrin.h>
+
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	int i = 0;
+
+	__m256d x0, x1, x2, x3;
+	__m256d __alpha;
+
+	x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
+	x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
+	x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2]));
+	x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3]));
+
+	__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+
+#ifdef __AVX512CD__
+	int n5;
+	__m512d x05, x15, x25, x35;
+	__m512d __alpha5;
+	n5 = n & ~7;
+
+	x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0]));
+	x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1]));
+	x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2]));
+	x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3]));
+
+	__alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha));
+
+	for (; i < n5; i+= 8) {
+		__m512d tempY;
+		__m512d sum;
+
+		sum = _mm512_loadu_pd(&ap[0][i]) * x05 +
+		      _mm512_loadu_pd(&ap[1][i]) * x15 +
+		      _mm512_loadu_pd(&ap[2][i]) * x25 +
+		      _mm512_loadu_pd(&ap[3][i]) * x35;
+
+		tempY = _mm512_loadu_pd(&y[i]);
+		tempY += sum * __alpha5;
+		_mm512_storeu_pd(&y[i], tempY);
+	}
+#endif
+
+	for (; i < n; i+= 4) {
+		__m256d tempY;
+		__m256d sum;
+
+		sum = _mm256_loadu_pd(&ap[0][i]) * x0 +
+		      _mm256_loadu_pd(&ap[1][i]) * x1 +
+		      _mm256_loadu_pd(&ap[2][i]) * x2 +
+		      _mm256_loadu_pd(&ap[3][i]) * x3;
+
+		tempY = _mm256_loadu_pd(&y[i]);
+		tempY += sum * __alpha;
+		_mm256_storeu_pd(&y[i], tempY);
+	}
+
+}
+
+
+#define HAVE_KERNEL_4x2
+
+static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	int i = 0;
+
+	__m256d x0, x1;
+	__m256d __alpha;
+
+	x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0]));
+	x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1]));
+
+	__alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha));
+
+
+	for (i = 0; i < n; i+= 4) {
+		__m256d tempY;
+		__m256d sum;
+
+		sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1;
+
+		tempY = _mm256_loadu_pd(&y[i]);
+		tempY += sum * __alpha;
+		_mm256_storeu_pd(&y[i], tempY);
+	}
+
+}
+
+#else
+#include "dgemv_n_microk_haswell-4.c"
+#endif

From 9bec34cb672843a872bf5338518c73bf32414239 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 11 Aug 2018 17:46:24 +0000
Subject: [PATCH 173/935] Add an AVX512 enabled DSYMV (L) function written in
 C intrinsics for best readability.

(the same C code works for Haswell as well) For logistical reasons the
code falls back to the existing haswell AVX2 implementation if the GCC
or LLVM compiler is not new enough
---
 kernel/x86_64/dsymv_L.c                   |   4 +-
 kernel/x86_64/dsymv_L_microk_skylakex-2.c | 161 ++++++++++++++++++++++
 2 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 kernel/x86_64/dsymv_L_microk_skylakex-2.c

diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c
index 73099462c..a722cc9df 100644
--- a/kernel/x86_64/dsymv_L.c
+++ b/kernel/x86_64/dsymv_L.c
@@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" +#elif defined (SKYLAKEX) +#include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" #elif defined(NEHALEM) diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c new file mode 100644 index 000000000..8244dffa1 --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -0,0 +1,161 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#include <immintrin.h>
+
+#define HAVE_KERNEL_4x4 1
+
+static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+
+	__m256d accum_0, accum_1, accum_2, accum_3;
+	__m256d temp1_0, temp1_1, temp1_2, temp1_3;
+
+	/* the 256 bit wide accumulator vectors start out as zero */
+	accum_0 = _mm256_setzero_pd();
+	accum_1 = _mm256_setzero_pd();
+	accum_2 = _mm256_setzero_pd();
+	accum_3 = _mm256_setzero_pd();
+
+	temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0]));
+	temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1]));
+	temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2]));
+	temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3]));
+
+#ifdef __AVX512CD__
+	__m512d accum_05, accum_15, accum_25, accum_35;
+	__m512d temp1_05, temp1_15, temp1_25, temp1_35;
+	BLASLONG to2;
+	int delta;
+
+	/* the 512 bit wide accumulator vectors start out as zero */
+	accum_05 = _mm512_setzero_pd();
+	accum_15 = _mm512_setzero_pd();
+	accum_25 = _mm512_setzero_pd();
+	accum_35 = _mm512_setzero_pd();
+
+	temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0]));
+	temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1]));
+	temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2]));
+	temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3]));
+
+	delta = (to - from) & ~7;
+	to2 = from + delta;
+
+
+	for (; from < to2; from += 8) {
+		__m512d _x, _y;
+		__m512d a0, a1, a2, a3;
+
+		_y = _mm512_loadu_pd(&y[from]);
+		_x = _mm512_loadu_pd(&x[from]);
+
+		a0 = _mm512_loadu_pd(&a[0][from]);
+		a1 = _mm512_loadu_pd(&a[1][from]);
+		a2 = _mm512_loadu_pd(&a[2][from]);
+		a3 = _mm512_loadu_pd(&a[3][from]);
+
+		_y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3;
+
+		accum_05 += _x * a0;
+		accum_15 += _x * a1;
+		accum_25 += _x * a2;
+		accum_35 += _x * a3;
+
+		_mm512_storeu_pd(&y[from], _y);
+
+	};
+
+	/*
+	 * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code
+	 * below can continue using the intermediate results in its loop
+	 */
+	accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1));
+	accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1));
+	accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1));
+	accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1));
+
+#endif
+
+	for (; from != to; from += 4) {
+		__m256d _x, _y;
+		__m256d a0, a1, a2, a3;
+
+		_y = _mm256_loadu_pd(&y[from]);
+		_x = _mm256_loadu_pd(&x[from]);
+
+		/* load 4 rows of matrix data */
+		a0 = _mm256_loadu_pd(&a[0][from]);
+		a1 = _mm256_loadu_pd(&a[1][from]);
+		a2 = _mm256_loadu_pd(&a[2][from]);
+		a3 = _mm256_loadu_pd(&a[3][from]);
+
+		_y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3;
+
+		accum_0 += _x * a0;
+		accum_1 += _x * a1;
+		accum_2 += _x * a2;
+		accum_3 += _x * a3;
+
+		_mm256_storeu_pd(&y[from], _y);
+
+	};
+
+	/*
+	 * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2
+	 * output array. There is no direct instruction for this in 256 bit space, only in 128 space.
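+	 * (Sketch of the step below: split each __m256d into two __m128d halves with
+	 * _mm256_extractf128_pd, add the halves, then let _mm_hadd_pd finish the pairwise sum.)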
+ */ + + __m128d half_accum0, half_accum1, half_accum2, half_accum3; + + + /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */ + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1)); + half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1)); + half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1)); + + /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + half_accum1 = _mm_hadd_pd(half_accum1, half_accum1); + half_accum2 = _mm_hadd_pd(half_accum2, half_accum2); + half_accum3 = _mm_hadd_pd(half_accum3, half_accum3); + + /* and store the lowest double value from each of these vectors in the temp2 output */ + temp2[0] += half_accum0[0]; + temp2[1] += half_accum1[0]; + temp2[2] += half_accum2[0]; + temp2[3] += half_accum3[0]; +} +#else +#include "dsymv_L_microk_haswell-2.c" +#endif \ No newline at end of file From e6c0e39492d49eded5a72c9882b79bed7bff35d0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 13 Aug 2018 12:23:40 +0300 Subject: [PATCH 174/935] Optimize Zgemv --- cpuid_zarch.c | 8 +- kernel/zarch/KERNEL.Z13 | 4 +- kernel/zarch/KERNEL.Z14 | 8 +- kernel/zarch/camax.c | 46 +- kernel/zarch/camin.c | 46 +- kernel/zarch/caxpy.c | 4 +- kernel/zarch/cgemv_n_4.c | 743 ++++++++++++++++++++ kernel/zarch/cgemv_t_4.c | 671 ++++++++++++++++++ kernel/zarch/icamax.c | 9 +- kernel/zarch/icamin.c | 9 +- kernel/zarch/idamax.c | 11 +- kernel/zarch/idamin.c | 11 +- kernel/zarch/idmax.c | 11 +- kernel/zarch/idmin.c | 11 +- kernel/zarch/isamax.c | 11 +- kernel/zarch/isamin.c | 11 +- kernel/zarch/ismax.c | 11 +- kernel/zarch/ismin.c | 11 +- kernel/zarch/izamax.c | 9 +- kernel/zarch/izamin.c | 9 +- kernel/zarch/zamax.c | 48 +- kernel/zarch/zamin.c | 46 +- kernel/zarch/zaxpy.c | 4 +- kernel/zarch/zgemv_n_4.c | 1401 ++++++++++++++++---------------------- kernel/zarch/zgemv_t_4.c | 1267 +++++++++++++++------------------- ztest/gemv.c | 159 +++-- 26 files changed, 2866 insertions(+), 1713 deletions(-) create mode 100644 kernel/zarch/cgemv_n_4.c create mode 100644 kernel/zarch/cgemv_t_4.c diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 073419fa8..8ed40099b 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,9 +27,9 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", @@ -112,7 +112,7 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; - case CPU_Z14: + case CPU_Z14: printf("#define Z14\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index d39b9d904..e5b974ab4 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index fa88b6881..80f78f48f 
100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -73,13 +73,13 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n_4.c DGEMVNKERNEL = dgemv_n_4.c -CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVNKERNEL = cgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVTKERNEL = dgemv_t_4.c -CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +CGEMVTKERNEL = cgemv_t_4.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 6394be769..3506c4e9b 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -198,7 +198,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG inc_x2; @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { maxf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) > maxf) { - maxf = ABS(x[i*2]); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } + ix += 2; i++; } return (maxf); } else { - inc_x2 = 2 * inc_x; maxf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) > maxf) { - maxf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) > maxf) { - maxf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) > maxf) { - maxf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (maxf); } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 936c300c8..726747b99 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -198,7 +198,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT minf = 0.0; BLASLONG inc_x2; @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { minf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) < minf) { - minf = ABS(x[i*2]); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } + ix += 2; i++; } return (minf); } else { - inc_x2 = 2 * inc_x; minf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) < minf) { - minf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) < minf) { - minf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) < minf) { - minf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) < minf) { 
+ minf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (minf); } diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index 2176f3dcd..fe5568cc8 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -110,7 +110,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "agfi %%r1,128 \n\t" "brctg %%r0,0b " : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -118,7 +118,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; + FLOAT da[2] __attribute__ ((aligned(16))); if (n <= 0) return (0); diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c new file mode 100644 index 000000000..4c3253774 --- /dev/null +++ b/kernel/zarch/cgemv_n_4.c @@ -0,0 +1,743 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#include "common.h" + +#define NBMAX 1024 + +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%5) \n\t" + "vlrepg %%v17,8(%5) \n\t" + "vlrepg %%v18,16(%5) \n\t" + "vlrepg %%v19,24(%5) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" +#else + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlef %%v24,0(%%r1,%1),0 \n\t" + "vlef %%v24,0(%%r1,%1),1 \n\t" + "vlef %%v24,8(%%r1,%1),2 \n\t" + "vlef %%v24,8(%%r1,%1),3 \n\t" + "vlef %%v25,4(%%r1,%1),0 \n\t" + "vlef %%v25,4(%%r1,%1),1 \n\t" + "vlef %%v25,12(%%r1,%1),2 \n\t" + "vlef %%v25,12(%%r1,%1),3 \n\t" + "vlef %%v26,0(%%r1,%2),0 \n\t" + "vlef %%v26,0(%%r1,%2),1 \n\t" + "vlef %%v26,8(%%r1,%2),2 \n\t" + "vlef %%v26,8(%%r1,%2),3 \n\t" + "vlef %%v27,4(%%r1,%2),0 \n\t" + "vlef %%v27,4(%%r1,%2),1 \n\t" + "vlef %%v27,12(%%r1,%2),2 \n\t" + "vlef %%v27,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlef %%v28,0(%%r1,%1),0 \n\t" + "vlef %%v28,0(%%r1,%1),1 \n\t" + "vlef %%v28,8(%%r1,%1),2 \n\t" + "vlef %%v28,8(%%r1,%1),3 \n\t" + "vlef %%v29,4(%%r1,%1),0 \n\t" + "vlef %%v29,4(%%r1,%1),1 \n\t" + "vlef %%v29,12(%%r1,%1),2 \n\t" + "vlef %%v29,12(%%r1,%1),3 \n\t" + "vlef %%v30,0(%%r1,%2),0 \n\t" + "vlef %%v30,0(%%r1,%2),1 \n\t" + "vlef %%v30,8(%%r1,%2),2 \n\t" + "vlef %%v30,8(%%r1,%2),3 \n\t" + "vlef %%v31,4(%%r1,%2),0 \n\t" + "vlef %%v31,4(%%r1,%2),1 \n\t" + "vlef %%v31,12(%%r1,%2),2 \n\t" + "vlef %%v31,12(%%r1,%2),3 \n\t" + + "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) + 
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%3) \n\t" + "vlrepg %%v17,8(%3) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" +#else + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlef %%v20,0(%%r1,%1),0 \n\t" + "vlef %%v20,0(%%r1,%1),1 \n\t" + "vlef %%v20,8(%%r1,%1),2 \n\t" + "vlef %%v20,8(%%r1,%1),3 \n\t" + "vlef %%v21,4(%%r1,%1),0 \n\t" + "vlef %%v21,4(%%r1,%1),1 \n\t" + "vlef %%v21,12(%%r1,%1),2 \n\t" + "vlef %%v21,12(%%r1,%1),3 \n\t" + "vlef %%v22,0(%%r1,%2),0 \n\t" + "vlef %%v22,0(%%r1,%2),1 \n\t" + "vlef %%v22,8(%%r1,%2),2 \n\t" + "vlef %%v22,8(%%r1,%2),3 \n\t" + "vlef %%v23,4(%%r1,%2),0 \n\t" + "vlef %%v23,4(%%r1,%2),1 \n\t" + "vlef %%v23,12(%%r1,%2),2 \n\t" + "vlef %%v23,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%4) \n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%2) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v17,4(%2),0 \n\t" + "vlef %%v17,4(%2),2 \n\t" + "vflcsb %%v17,%%v17 \n\t" + "vlef %%v17,0(%2),1 \n\t" + "vlef %%v17,0(%2),3 \n\t" +#else + "vlef %%v17,0(%2),1 \n\t" + "vlef %%v17,0(%2),3 \n\t" + "vflcsb %%v17,%%v17 \n\t" + "vlef %%v17,4(%2),0 \n\t" + "vlef %%v17,4(%2),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vlef %%v18,0(%%r1,%1),0 \n\t" + "vlef %%v18,0(%%r1,%1),1 \n\t" + "vlef %%v18,8(%%r1,%1),2 \n\t" + "vlef %%v18,8(%%r1,%1),3 \n\t" + "vlef %%v19,4(%%r1,%1),0 \n\t" + "vlef %%v19,4(%%r1,%1),1 \n\t" + "vlef %%v19,12(%%r1,%1),2 \n\t" + "vlef %%v19,12(%%r1,%1),3 \n\t" + + "vl %%v0,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19" + ); +} + +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) +{ + __asm__ volatile ( +#if !defined(XCONJ) + "vlrepf %%v0,%3 \n\t" + "vlef %%v1,%4,0 \n\t" + "vlef 
%%v1,%4,2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,%4,1 \n\t" + "vlef %%v1,%4,3 \n\t" +#else + "vlef %%v0,%3,1 \n\t" + "vlef %%v0,%3,3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,%3,0 \n\t" + "vlef %%v0,%3,2 \n\t" + "vlrepf %%v1,%4 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,2 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,0(%%r1,%2) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + + "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" + + "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" + + "vst %%v22,0(%%r1,%2) \n\t" + "vst %%v23,16(%%r1,%2) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); + + x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; + + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; 
+#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return(0); + } + + + if ( m3 == 1 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * 
x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + return(0); +} diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index e7f096e0d..9b4077c6b 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + maxf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - maxf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index b9c1ccd9c..6e952a325 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + minf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index aba880949..d1f135369 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) > maxf) { @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) > maxf) { diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 3213efa4d..679606a8f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) < minf) { @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) < minf) { diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 26fff4eb0..5de41ac7b 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = x[0]; + i++; + } while (i < n) { if (x[i] > maxf) { @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) 
{ if (x[i] > maxf) { diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 570b33a15..7fec111cf 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = x[0]; + i++; + } while (i < n) { if (x[i] < minf) { @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] < minf) { diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 95a665b10..d2686c0cd 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) > maxf) { @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) > maxf) { diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 640fc02c9..768f31a8c 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) < minf) { @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) < minf) { diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 0eb350315..8fc32adf6 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = x[0]; + i++; + } while (i < n) { if (x[i] > maxf) { @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] > maxf) { diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index f050db8cb..415052810 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = x[0]; + i++; + } while (i < n) { if (x[i] < minf) { @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] < minf) { diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index bf5f621a7..541464b05 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + maxf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - maxf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 3636e8fdf..4b5572b80 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + minf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + inc_x2 = 
2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 6393b099b..937bc9753 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -150,7 +150,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG inc_x2; @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { maxf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) > maxf) { - maxf = ABS(x[i*2]); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } + ix += 2; i++; } return (maxf); } else { - - inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) > maxf) { - maxf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) > maxf) { - maxf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) > maxf) { - maxf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (maxf); } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index b15774bb9..8564edaf4 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -150,7 +150,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT minf = 0.0; BLASLONG inc_x2; @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { minf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) < minf) { - minf = ABS(x[i*2]); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } + ix += 2; i++; } return (minf); } else { - inc_x2 = 2 * inc_x; minf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) < minf) { - minf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) < minf) { - minf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) < minf) { - minf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (minf); } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 6ba44a27c..f0e993d2f 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -106,7 +106,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "agfi %%r1,128 \n\t" "brctg %%r0,0b " : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"r"(n),"ZR"((const FLOAT (*)[n 
* 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -114,7 +114,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; + FLOAT da[2] __attribute__ ((aligned(16))); if (n <= 0) return (0); diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 484db3073..9472b5d5a 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2018, The OpenBLAS Project +Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,898 +23,693 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ #include #include #include "common.h" -#define HAVE_KERNEL_4x4_VEC 1 -#define HAVE_KERNEL_4x2_VEC 1 -#define HAVE_KERNEL_4x1_VEC 1 -#define HAVE_KERNEL_ADDY 1 - -#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) -#include -#endif - -// #define NBMAX 1024 -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%5) \n\t" + "vl %%v17,16(%5) \n\t" + "vl %%v18,32(%5) \n\t" + "vl %%v19,48(%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - register __vector double vx2_r = {x[4], x[4]}; - register __vector double vx2_i = {-x[5], x[5]}; - register __vector double vx3_r = {x[6], x[6]}; - register __vector double vx3_i = {-x[7], x[7]}; - + "vleg %%v20,8(%5),0 \n\t" + "wflcdb %%v20,%%v20 \n\t" + "vleg %%v20,0(%5),1 \n\t" + "vleg %%v21,24(%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,16(%5),1 \n\t" + "vleg %%v22,40(%5),0 \n\t" + "wflcdb %%v22,%%v22 \n\t" + "vleg %%v22,32(%5),1 \n\t" + "vleg %%v23,56(%5),0 \n\t" + "wflcdb %%v23,%%v23 \n\t" + "vleg %%v23,48(%5),1 \n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; - register __vector double vx2_r = {x[4], -x[4]}; - register __vector double vx2_i = {x[5], x[5]}; - register __vector double vx3_r = {x[6], -x[6]}; - register __vector double vx3_i = {x[7], x[7]}; + "vleg 
%%v20,0(%5),1 \n\t" + "vflcdb %%v20,%%v20 \n\t" + "vleg %%v20,8(%5),0 \n\t" + "vleg %%v21,16(%5),1 \n\t" + "vflcdb %%v21,%%v21 \n\t" + "vleg %%v21,24(%5),0 \n\t" + "vleg %%v22,32(%5),1 \n\t" + "vflcdb %%v22,%%v22 \n\t" + "vleg %%v22,40(%5),0 \n\t" + "vleg %%v23,48(%5),1 \n\t" + "vflcdb %%v23,%%v23 \n\t" + "vleg %%v23,56(%5),0 \n\t" #endif - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - register __vector double *vptr_a2 = (__vector double *) a2; - register __vector double *vptr_a3 = (__vector double *) a3; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - register __vector double va2 = vptr_a2[i]; - register __vector double va2_1 = vptr_a2[i + 1]; - register __vector double va2_2 = vptr_a2[i + 2]; - register __vector double va2_3 = vptr_a2[i + 3]; - - register __vector double va3 = vptr_a3[i]; - register __vector double va3_1 = vptr_a3[i + 1]; - register __vector double va3_2 = vptr_a3[i + 2]; - register __vector double va3_3 = vptr_a3[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va2*vx2_r; - vy_1 += va2_1*vx2_r; - vy_2 += va2_2*vx2_r; - vy_3 += va2_3*vx2_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va3*vx3_r; - vy_1 += va3_1*vx3_r; - vy_2 += va3_2*vx3_r; - vy_3 += va3_3*vx3_r; - - va2 = vec_permi(va2, va2, 2); - va2_1 = vec_permi(va2_1, va2_1, 2); - va2_2 = vec_permi(va2_2, va2_2, 2); - va2_3 = vec_permi(va2_3, va2_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - va3 = vec_permi(va3, va3, 2); - va3_1 = vec_permi(va3_1, va3_1, 2); - va3_2 = vec_permi(va3_2, va3_2, 2); - va3_3 = vec_permi(va3_3, va3_3, 2); - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy_0 += va2*vx2_i; - vy_1 += va2_1*vx2_i; - vy_2 += va2_2*vx2_i; - vy_3 += va2_3*vx2_i; - - vy_0 += va3*vx3_i; - vy_1 += va3_1*vx3_i; - vy_2 += va3_2*vx3_i; - vy_3 += va3_3*vx3_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlrepg %%v24,0(%%r1,%1) \n\t" + "vlrepg %%v25,8(%%r1,%1) \n\t" + "vlrepg %%v26,0(%%r1,%2) \n\t" + "vlrepg %%v27,8(%%r1,%2) \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" + 
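/* complex multiply-accumulate as two FMAs per element (non-conjugated
   variant): v24/v26 broadcast the real parts of the matrix entries and
   multiply the plain x values in v16/v17, while v25/v27 broadcast the
   imaginary parts and multiply the sign-flipped, swapped copies in v20/v21,
   so each vfmadb pair adds (a_re*x_re - a_im*x_im, a_re*x_im + a_im*x_re)
   into y */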
"vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlrepg %%v28,0(%%r1,%3) \n\t" + "vlrepg %%v29,8(%%r1,%3) \n\t" + "vlrepg %%v30,0(%%r1,%4) \n\t" + "vlrepg %%v31,8(%%r1,%4) \n\t" + + "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,0(%%r1,%6) \n\t" + + "vlrepg %%v24,16(%%r1,%1) \n\t" + "vlrepg %%v25,24(%%r1,%1) \n\t" + "vlrepg %%v26,16(%%r1,%2) \n\t" + "vlrepg %%v27,24(%%r1,%2) \n\t" + + "vl %%v0,16(%%r1,%6) \n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlrepg %%v28,16(%%r1,%3) \n\t" + "vlrepg %%v29,24(%%r1,%3) \n\t" + "vlrepg %%v30,16(%%r1,%4) \n\t" + "vlrepg %%v31,24(%%r1,%4) \n\t" + + "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - for (i = 0; i < 2 * n; i += 2) { +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%3) \n\t" + "vl %%v17,16(%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; + "vleg %%v18,8(%3),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%3),1 \n\t" + "vleg %%v19,24(%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,16(%3),1 \n\t" +#else + "vleg %%v18,0(%3),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,8(%3),0 \n\t" + "vleg %%v19,16(%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,24(%3),0 \n\t" #endif - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlrepg %%v20,0(%%r1,%1) \n\t" + "vlrepg %%v21,8(%%r1,%1) \n\t" + "vlrepg %%v22,0(%%r1,%2) \n\t" + "vlrepg %%v23,8(%%r1,%2) \n\t" + + "vl %%v0,0(%%r1,%4) \n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,0(%%r1,%4) \n\t" + + "vlrepg %%v20,16(%%r1,%1) \n\t" + "vlrepg %%v21,24(%%r1,%1) \n\t" + "vlrepg %%v22,16(%%r1,%2) \n\t" + "vlrepg 
%%v23,24(%%r1,%2) \n\t" + + "vl %%v0,16(%%r1,%4) \n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - - +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - + "vleg %%v17,8(%2),0 \n\t" + "wflcdb %%v17,%%v17 \n\t" + "vleg %%v17,0(%2),1 \n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; + "vleg %%v17,0(%2),1 \n\t" + "vflcdb %%v17,%%v17 \n\t" + "vleg %%v17,8(%2),0 \n\t" #endif - - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vlrepg %%v18,0(%%r1,%1) \n\t" + "vlrepg %%v19,8(%%r1,%1) \n\t" + + "vl %%v0,0(%%r1,%3) \n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,0(%%r1,%3) \n\t" + + "vlrepg %%v18,16(%%r1,%1) \n\t" + "vlrepg %%v19,24(%%r1,%1) \n\t" + + "vl %%v0,16(%%r1,%3) \n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,16(%%r1,%3) \n\t" + 
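/* loop control: r1 advances 32 bytes (two complex doubles) per pass and
   brctg decrements the halved element count set up by srlg in r0, so each
   iteration retires two elements of the column */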
+ "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19" + ); } -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - for (i = 0; i < 2 * n; i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) +{ + __asm__ volatile ( +#if !defined(XCONJ) + "vlrepg %%v0,%3 \n\t" + "vleg %%v1,%4,0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,%4,1 \n\t" +#else + "vleg %%v0,%3,1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,%3,0 \n\t" + "vlrepg %%v1,%4 \n\t" #endif - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,2 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "agfi %%r1,64 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; + if ( inc_dest != 2 ) + { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -#endif - -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - register __vector double 
vtemp2_p = {0.0, 0.0}; - register __vector double vtemp2_r = {0.0, 0.0}; - register __vector double vtemp3_p = {0.0, 0.0}; - register __vector double vtemp3_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { -// __builtin_prefetch(&x[i]); -// __builtin_prefetch(&a0[i]); -// __builtin_prefetch(&a1[i]); -// __builtin_prefetch(&a2[i]); -// __builtin_prefetch(&a3[i]); - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double va2 = *(__vector double*) (&a2[i]); - register __vector double va2_1 = *(__vector double*) (&a2[i + 2]); - register __vector double va2_2 = *(__vector double*) (&a2[i + 4]); - register __vector double va2_3 = *(__vector double*) (&a2[i + 6]); - - register __vector double va3 = *(__vector double*) (&a3[i]); - register __vector double va3_1 = *(__vector double*) (&a3[i + 2]); - register __vector double va3_2 = *(__vector double*) (&a3[i + 4]); - register __vector double va3_3 = *(__vector double*) (&a3[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vtemp2_p += vx_0*va2; - vtemp2_r += vxr_0*va2; - - vtemp3_p += vx_0*va3; - vtemp3_r += vxr_0*va3; - - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp2_p += vx_1*va2_1; - vtemp2_r += vxr_1*va2_1; - - vtemp3_p += vx_1*va3_1; - vtemp3_r += vxr_1*va3_1; - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp2_p += vx_2*va2_2; - vtemp2_r += vxr_0*va2_2; - - vtemp3_p += vx_2*va3_2; - vtemp3_r += vxr_0*va3_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - vtemp2_p += vx_3*va2_3; - vtemp2_r += vxr_1*va2_3; - - vtemp3_p += vx_3*va3_3; - vtemp3_r += vxr_1*va3_3; - - } +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "vzero %%v17 \n\t" + "vzero %%v18 \n\t" + "vzero %%v19 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v20,0(%%r1,%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - - register FLOAT 
temp_r2 = vtemp2_p[0] - vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vleg %%v21,8(%%r1,%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,0(%%r1,%5),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v21,0(%%r1,%5),1 \n\t" + "vflcdb %%v21,%%v21 \n\t" + "vleg %%v21,8(%%r1,%5),0 \n\t" #endif -} -#else - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_r2 = 0.0; - FLOAT temp_r3 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - FLOAT temp_i2 = 0.0; - FLOAT temp_i3 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v24,0(%%r1,%1) \n\t" + "vlrepg %%v25,8(%%r1,%1) \n\t" + "vlrepg %%v26,0(%%r1,%2) \n\t" + "vlrepg %%v27,8(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v24,%%v20,%%v16 \n\t" + "vfmadb %%v16,%%v25,%%v21,%%v16 \n\t" + "vfmadb %%v17,%%v26,%%v20,%%v17 \n\t" + "vfmadb %%v17,%%v27,%%v21,%%v17 \n\t" + + "vlrepg %%v28,0(%%r1,%3) \n\t" + "vlrepg %%v29,8(%%r1,%3) \n\t" + "vlrepg %%v30,0(%%r1,%4) \n\t" + "vlrepg %%v31,8(%%r1,%4) \n\t" + + "vfmadb %%v18,%%v28,%%v20,%%v18 \n\t" + "vfmadb %%v18,%%v29,%%v21,%%v18 \n\t" + "vfmadb %%v19,%%v30,%%v20,%%v19 \n\t" + "vfmadb %%v19,%%v31,%%v21,%%v19 \n\t" + + "vl %%v22,16(%%r1,%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i]; + "vleg %%v23,24(%%r1,%5),0 \n\t" + "wflcdb %%v23,%%v23 \n\t" + "vleg %%v23,16(%%r1,%5),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; - temp_r2 += 
a2[i] * x[i] + a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i]; + "vleg %%v23,16(%%r1,%5),1 \n\t" + "vflcdb %%v23,%%v23 \n\t" + "vleg %%v23,24(%%r1,%5),0 \n\t" #endif - } + "vlrepg %%v24,16(%%r1,%1) \n\t" + "vlrepg %%v25,24(%%r1,%1) \n\t" + "vlrepg %%v26,16(%%r1,%2) \n\t" + "vlrepg %%v27,24(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v24,%%v22,%%v16 \n\t" + "vfmadb %%v16,%%v25,%%v23,%%v16 \n\t" + "vfmadb %%v17,%%v26,%%v22,%%v17 \n\t" + "vfmadb %%v17,%%v27,%%v23,%%v17 \n\t" + + "vlrepg %%v28,16(%%r1,%3) \n\t" + "vlrepg %%v29,24(%%r1,%3) \n\t" + "vlrepg %%v30,16(%%r1,%4) \n\t" + "vlrepg %%v31,24(%%r1,%4) \n\t" + + "vfmadb %%v18,%%v28,%%v22,%%v18 \n\t" + "vfmadb %%v18,%%v29,%%v23,%%v18 \n\t" + "vfmadb %%v19,%%v30,%%v22,%%v19 \n\t" + "vfmadb %%v19,%%v31,%%v23,%%v19 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vlrepg %%v24,0(%7) \n\t" + "vleg %%v25,8(%7),0 \n\t" + "wflcdb %%v25,%%v25 \n\t" + "vleg %%v25,8(%7),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v24,0(%7),1 \n\t" + "vflcdb %%v24,%%v24 \n\t" + "vleg %%v24,0(%7),0 \n\t" + "vlrepg %%v25,8(%7) \n\t" #endif + "vl %%v26,0(%6) \n\t" + "vl %%v27,16(%6) \n\t" + "vl %%v28,32(%6) \n\t" + "vl %%v29,48(%6) \n\t" + "vfmadb %%v26,%%v16,%%v24,%%v26 \n\t" + "vfmadb %%v26,%%v20,%%v25,%%v26 \n\t" + "vfmadb %%v27,%%v17,%%v24,%%v27 \n\t" + "vfmadb %%v27,%%v21,%%v25,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v24,%%v28 \n\t" + "vfmadb %%v28,%%v22,%%v25,%%v28 \n\t" + "vfmadb %%v29,%%v19,%%v24,%%v29 \n\t" + "vfmadb %%v29,%%v23,%%v25,%%v29 \n\t" + "vst %%v26,0(%6) \n\t" + "vst %%v27,16(%6) \n\t" + "vst %%v28,32(%6) \n\t" + "vst %%v29,48(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) 
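/* (removed intrinsics path) the two accumulators per column collect, in SIMD
   lanes, the direct products (x_re*a_re, x_im*a_im) in vtempN_p and the
   swapped products (x_im*a_re, x_re*a_im) in vtempN_r; the reduction after
   the loop then forms temp_r = p[0] -/+ p[1] and temp_i = r[0] +/- r[1],
   i.e. the complex dot product for the plain and conjugated cases */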
{ - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - } - +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "vzero %%v17 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v18,0(%%r1,%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vleg %%v19,8(%%r1,%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,0(%%r1,%3),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v19,0(%%r1,%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,8(%%r1,%3),0 \n\t" #endif -} - -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v20,0(%%r1,%1) \n\t" + "vlrepg %%v21,8(%%r1,%1) \n\t" + "vlrepg %%v22,0(%%r1,%2) \n\t" + "vlrepg %%v23,8(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + "vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" + "vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" + "vfmadb 
%%v17,%%v23,%%v19,%%v17 \n\t" + + "vl %%v18,16(%%r1,%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; + "vleg %%v19,24(%%r1,%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,16(%%r1,%3),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; + "vleg %%v19,16(%%r1,%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,24(%%r1,%3),0 \n\t" #endif - } + "vlrepg %%v20,16(%%r1,%1) \n\t" + "vlrepg %%v21,24(%%r1,%1) \n\t" + "vlrepg %%v22,16(%%r1,%2) \n\t" + "vlrepg %%v23,24(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + "vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" + "vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" + "vfmadb %%v17,%%v23,%%v19,%%v17 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v18,%%v16,%%v16,4 \n\t" + "vpdi %%v19,%%v17,%%v17,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vlrepg %%v20,0(%5) \n\t" + "vleg %%v21,8(%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,8(%5),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v20,0(%5),1 \n\t" + "vflcdb %%v20,%%v20 \n\t" + "vleg %%v20,0(%5),0 \n\t" + "vlrepg %%v21,8(%5) \n\t" #endif + "vl %%v22,0(%4) \n\t" + "vl %%v23,16(%4) \n\t" + "vfmadb %%v22,%%v16,%%v20,%%v22 \n\t" + "vfmadb %%v22,%%v18,%%v21,%%v22 \n\t" + "vfmadb %%v23,%%v17,%%v20,%%v23 \n\t" + "vfmadb %%v23,%%v19,%%v21,%%v23 \n\t" + "vst %%v22,0(%4) \n\t" + "vst %%v23,16(%4) \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0 ; - a0 = ap; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += 
vxr_1*va0_1; - - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - } - +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v17,0(%%r1,%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - + "vleg %%v18,8(%%r1,%2),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%%r1,%2),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + "vleg %%v18,0(%%r1,%2),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,8(%%r1,%2),0 \n\t" #endif -} - -#else - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0; - a0 = ap; + "vlrepg %%v19,0(%%r1,%1) \n\t" + "vlrepg %%v20,8(%%r1,%1) \n\t" + + "vfmadb %%v16,%%v19,%%v17,%%v16 \n\t" + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vl %%v17,16(%%r1,%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + "vleg %%v18,24(%%r1,%2),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,16(%%r1,%2),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + "vleg %%v18,16(%%r1,%2),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,24(%%r1,%2),0 \n\t" #endif - } -#if !defined(XCONJ) + "vlrepg %%v19,16(%%r1,%1) \n\t" + "vlrepg %%v20,24(%%r1,%1) \n\t" + + "vfmadb %%v16,%%v19,%%v17,%%v16 \n\t" + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + "vpdi %%v17,%%v16,%%v16,4 \n\t" +#if !defined(XCONJ) + "vlrepg %%v18,0(%4) \n\t" + "vleg %%v19,8(%4),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,8(%4),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - + "vleg %%v18,0(%4),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%4),0 \n\t" + "vlrepg %%v19,8(%4) \n\t" #endif - + "vl %%v20,0(%3) \n\t" + "vfmadb %%v20,%%v16,%%v18,%%v20 \n\t" + "vfmadb %%v20,%%v17,%%v19,%%v20 \n\t" + "vst %%v20,0(%3) \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20" + ); } -#endif - -static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) 
- m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8],*xbuffer; + FLOAT alpha[2]; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - 
a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * 
x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } - while (j < n) { + + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return (0); - } + return(0); + } - if (m3 == 1) { - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; + if ( m3 == 1 ) + { - while (j < (n & -2)) { + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { #if ( 
!defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } - while (j < n) { + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + return(0); } - diff --git a/ztest/gemv.c b/ztest/gemv.c index f1ee972bc..964afd3ef 100644 --- a/ztest/gemv.c +++ b/ztest/gemv.c @@ -52,67 +52,66 @@ int assert_dbl_near(double exp, double real, double tol) { int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; - BLASLONG ix,iy; + BLASLONG ix, iy; BLASLONG j; FLOAT *a_ptr; - FLOAT temp_r,temp_i; - BLASLONG inc_x2,inc_y2; + FLOAT temp_r, temp_i; + BLASLONG inc_x2, inc_y2; BLASLONG lda2; BLASLONG i2; - lda2 = 2*lda; + lda2 = 2 * lda; ix = 0; a_ptr = a; - if ( inc_x == 1 && inc_y == 1 ) + if (inc_x == 1 && inc_y == 1) { - for (j=0; j Date: Tue, 14 Aug 2018 12:30:38 +0200 Subject: 
[PATCH 175/935] detect z14 arch on s390x --- cpuid_zarch.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e1935429..e0d9221f3 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,15 +29,18 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) @@ -62,6 +65,10 @@ int detect(void) if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13; + /* detect z14, but fall back to z13 */ + if (strstr(p, "3906")) return CPU_Z13; + if (strstr(p, "3907")) return CPU_Z13; + return CPU_GENERIC; } @@ -107,5 +114,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } From fd42ca462d2df0eece73b26865fa55f7bfa07e53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 19:35:16 +0200 Subject: [PATCH 176/935] Combo of default pre-0.3.1 memory.c and band-aided version of PR1739 --- driver/others/memory.c | 1725 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 1606 insertions(+), 119 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 98bcfb216..6bca1e11f 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -72,6 +72,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //#undef DEBUG #include "common.h" + +#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || __GLIBC_PREREQ(2,20)) +#warning "using tls version of memory.c" #include #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) @@ -108,6 +111,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#ifdef OS_HAIKU +#include +#endif + #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include @@ -139,14 +146,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FIXED_PAGESIZE 4096 #endif -#ifndef BUFFERS_PER_THREAD -#ifdef USE_OPENMP -#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) -#else -#define BUFFERS_PER_THREAD NUM_BUFFERS -#endif -#endif - #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -238,6 +237,14 @@ int get_num_procs(void) { } #endif +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -363,7 +370,7 @@ int blas_get_cpu_number(void){ #endif // blas_goto_num = 0; -#ifndef USE_OPENMP +#ifndef USE_OPENMP_UNUSED blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -420,10 +427,8 @@ int openblas_get_num_threads(void) { int hugetlb_allocated = 0; #if defined(OS_WINDOWS) -#define THREAD_LOCAL __declspec(thread) #define LIKELY_ONE(x) (x) #else -#define THREAD_LOCAL __thread #define LIKELY_ONE(x) (__builtin_expect(x, 1)) #endif @@ -459,62 +464,15 @@ struct alloc_t { for an auxiliary tracking structure. 
*/ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); -/* Clang supports TLS from version 2.8 */ -#if defined(__clang__) && __clang_major__ > 2 || \ - (__clang_minor__ == 2 || __clang_minor__ == 8) -#define HAS_COMPILER_TLS -#endif - -/* GCC supports TLS from version 4.1 */ -#if !defined(__clang__) && defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) -#define HAS_COMPILER_TLS -#endif - -/* MSVC supports TLS from version 2005 */ -#if defined(_MSC_VER) && _MSC_VER >= 1400 -#define HAS_COMPILER_TLS -#endif - -/* Versions of XCode before 8 did not properly support TLS */ -#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 -#undef HAS_COMPILER_TLS -#endif - -/* Android NDK's before version 12b did not support TLS */ -#if defined(__ANDROID__) && defined(__clang__) -#if __has_include() -#include -#endif -#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ - defined(__NDK_MINOR__) && \ - ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) -#undef HAS_COMPILER_TLS -#endif -#endif - -/* Holds pointers to allocated memory */ -#if defined(SMP) && !defined(USE_OPENMP) -/* This is the number of threads than can be spawned by the server, which is the - server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 -static int next_memory_table_pos = 0; -# if defined(HAS_COMPILER_TLS) -/* Use compiler generated thread-local-storage */ -static int THREAD_LOCAL local_memory_table_pos = 0; +#if defined(SMP) +# if defined(OS_WINDOWS) +static DWORD local_storage_key = 0; +DWORD lsk; # else -/* Use system-dependent thread-local-storage */ -# if defined(OS_WINDOWS) -static DWORD local_storage_key; -# else -static pthread_key_t local_storage_key; -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#else -/* There is only one allocating thread when in single-threaded mode and when using OpenMP */ -# define MAX_ALLOCATING_THREADS 1 -#endif /* defined(SMP) && !defined(USE_OPENMP) */ -static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; +static pthread_key_t local_storage_key = 0; +pthread_key_t lsk; +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -530,34 +488,54 @@ static pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t key_lock = 0; +#else +static BLASULONG key_lock = 0UL; +#endif + /* Returns a pointer to the start of the per-thread memory allocation data */ static __inline struct alloc_t ** get_memory_table() { -#if defined(SMP) && !defined(USE_OPENMP) -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); -# else - int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ - if (!local_memory_table_pos) { - LOCK_COMMAND(&alloc_lock); - local_memory_table_pos = next_memory_table_pos++; - if (next_memory_table_pos > MAX_ALLOCATING_THREADS) - printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); - UNLOCK_COMMAND(&alloc_lock); -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - 
::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); -# else - pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ - } - return local_memory_table[local_memory_table_pos]; +#if defined(SMP) +LOCK_COMMAND(&key_lock); +lsk=local_storage_key; +UNLOCK_COMMAND(&key_lock); + if (!lsk) { + blas_memory_init(); + } +# if defined(OS_WINDOWS) + struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key); +# else + struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ +#else + static struct alloc_t ** local_memory_table = NULL; +#endif /* defined(SMP) */ +#if defined (SMP) +LOCK_COMMAND(&key_lock); +lsk=local_storage_key; +UNLOCK_COMMAND(&key_lock); + if (lsk && !local_memory_table) { #else - return local_memory_table[0]; -#endif /* defined(SMP) && !defined(USE_OPENMP) */ + if (!local_memory_table) { +#endif /* defined(SMP) */ + local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS); + memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS); +#if defined(SMP) +# if defined(OS_WINDOWS) +LOCK_COMMAND(&key_lock); + TlsSetValue(local_storage_key, (void*)local_memory_table); +UNLOCK_COMMAND(&key_lock); +# else +LOCK_COMMAND(&key_lock); + pthread_setspecific(local_storage_key, (void*)local_memory_table); +UNLOCK_COMMAND(&key_lock); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ + } + return local_memory_table; } #ifdef ALLOC_MMAP @@ -637,7 +615,7 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { static void *alloc_mmap(void *address){ void *map_address, *best_address; - BLASULONG best, start, current; + BLASULONG best, start, current, original; BLASULONG allocsize; if (address){ @@ -685,8 +663,9 @@ static void *alloc_mmap(void *address){ start = (BLASULONG)map_address; current = (SCALING - 1) * allocation_block_size; + original = current; - while(current > 0) { + while(current > 0 && current <= original) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; @@ -1056,18 +1035,29 @@ static volatile int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ + static void blas_memory_cleanup(void* ptr){ + if (ptr) { + struct alloc_t ** table = (struct alloc_t **)ptr; + int pos; + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + struct alloc_t *alloc_info = table[pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + table[pos] = (void *)0; + } + } + free(table); + } +} + static void blas_memory_init(){ -#if defined(SMP) && !defined(USE_OPENMP) - next_memory_table_pos = 0; -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - local_storage_key = ::TlsAlloc(); -# else - pthread_key_create(&local_storage_key, NULL); -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#endif /* defined(SMP) && !defined(USE_OPENMP) */ - memset(local_memory_table, 0, sizeof(local_memory_table)); +#if defined(SMP) +# if defined(OS_WINDOWS) + local_storage_key = TlsAlloc(); +# else + pthread_key_create(&local_storage_key, blas_memory_cleanup); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ } void *blas_memory_alloc(int procpos){ @@ -1105,7 +1095,16 @@ void *blas_memory_alloc(int procpos){ struct alloc_t * alloc_info; struct alloc_t ** alloc_table; + +#if defined(SMP) && !defined(USE_OPENMP) +int mi; +LOCK_COMMAND(&alloc_lock); 
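+ /* A descriptive note on the lines below: memory_initialized is copied
+    into a local while the allocation lock is held, so the fast-path test
+    never reads the flag while another thread is still running the
+    one-time initialization. */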
+mi=memory_initialized;
+UNLOCK_COMMAND(&alloc_lock);
+ if (!LIKELY_ONE(mi)) {
+#else
 if (!LIKELY_ONE(memory_initialized)) {
+#endif
 #if defined(SMP) && !defined(USE_OPENMP)
 /* Only allow a single thread to initialize memory system */
 LOCK_COMMAND(&alloc_lock);
@@ -1149,7 +1148,7 @@ void *blas_memory_alloc(int procpos){
 if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
 position ++;
- } while (position < BUFFERS_PER_THREAD);
+ } while (position < NUM_BUFFERS);
 goto error;
@@ -1247,7 +1246,7 @@ void blas_memory_free(void *buffer){
 #ifdef DEBUG
 alloc_table = get_memory_table();
- for (position = 0; position < BUFFERS_PER_THREAD; position++){
+ for (position = 0; position < NUM_BUFFERS; position++){
 if (alloc_table[position]) {
 printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
 }
@@ -1267,22 +1266,14 @@ void blas_memory_free_nolock(void * map_address) {
 }
 void blas_shutdown(void){
-
- int pos, thread;
-
 #ifdef SMP
 BLASFUNC(blas_thread_shutdown)();
 #endif
-
- for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
- for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
- struct alloc_t *alloc_info = local_memory_table[thread][pos];
- if (alloc_info) {
- alloc_info->release_func(alloc_info);
- alloc_info = (void *)0;
- }
- }
- }
+#ifdef SMP
+ /* Only clean up if we were built for threading and TLS was initialized */
+ if (local_storage_key)
+#endif
+ blas_memory_cleanup((void*)get_memory_table());
 #ifdef SEEK_ADDRESS
 base_address = 0UL;
@@ -1503,6 +1494,9 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
 case DLL_THREAD_ATTACH:
 break;
 case DLL_THREAD_DETACH:
+#if defined(SMP)
+ blas_memory_cleanup((void*)get_memory_table());
+#endif
 break;
 case DLL_PROCESS_DETACH:
 gotoblas_quit();
@@ -1573,3 +1567,1496 @@ void gotoblas_dummy_for_PGI(void) {
 #endif
 }
 #endif
+
+#else
+#include
+
+#ifdef OS_WINDOWS
+#define ALLOC_WINDOWS
+#ifndef MEM_LARGE_PAGES
+#define MEM_LARGE_PAGES 0x20000000
+#endif
+#else
+#define ALLOC_MMAP
+#define ALLOC_MALLOC
+#endif
+
+#include
+#include
+#include
+
+#ifndef OS_WINDOWS
+#include
+#ifndef NO_SYSV_IPC
+#include
+#endif
+#include
+#endif
+
+#include
+
+#ifdef OS_LINUX
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#endif
+
+#if defined(OS_FREEBSD) || defined(OS_DARWIN)
+#include
+#include
+#endif
+
+#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
+#include
+#undef printf
+#define printf _cprintf
+#endif
+
+#ifdef OS_LINUX
+
+#ifndef MPOL_PREFERRED
+#define MPOL_PREFERRED 1
+#endif
+
+#endif
+
+#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
+#define NO_WARMUP
+#endif
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#ifndef FIXED_PAGESIZE
+#define FIXED_PAGESIZE 4096
+#endif
+
+#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define CONSTRUCTOR __cdecl
+#define DESTRUCTOR __cdecl
+#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
+#define CONSTRUCTOR __attribute__ ((constructor))
+#define DESTRUCTOR __attribute__ ((destructor))
+#else
+#define CONSTRUCTOR __attribute__ ((constructor(101)))
+#define DESTRUCTOR __attribute__ ((destructor(101)))
+#endif
+
+#ifdef DYNAMIC_ARCH
+gotoblas_t *gotoblas = NULL;
+#endif
+extern void openblas_warning(int verbose, const char * msg);
+
+#ifndef SMP
+
+#define blas_cpu_number 1
+#define blas_num_threads 1
+
+/* Dummy Function */
+int goto_get_num_procs (void) { return 1;};
+void 
goto_set_num_threads(int num_threads) {}; + +#else + +#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD) +#ifndef NO_AFFINITY +int get_num_procs(void); +#else +int get_num_procs(void) { + static int nums = 0; +cpu_set_t *cpusetp; +size_t size; +int ret; +int i,n; + + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); +#if !defined(OS_LINUX) + return nums; +#endif + +#if !defined(__GLIBC_PREREQ) + return nums; +#else + #if !__GLIBC_PREREQ(2, 3) + return nums; + #endif + + #if !__GLIBC_PREREQ(2, 7) + ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + if (ret!=0) return nums; + n=0; + #if !__GLIBC_PREREQ(2, 6) + for (i=0;i 0) blas_num_threads = blas_goto_num; + else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; + else blas_num_threads = MAX_CPU_NUMBER; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) + if (blas_num_threads > max_num) blas_num_threads = max_num; +#endif + + if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; + +#ifdef DEBUG + printf( "Adjusted number of threads : %3d\n", blas_num_threads); +#endif + + blas_cpu_number = blas_num_threads; + + return blas_num_threads; +} +#endif + + +int openblas_get_num_procs(void) { +#ifndef SMP + return 1; +#else + return get_num_procs(); +#endif +} + +int openblas_get_num_threads(void) { +#ifndef SMP + return 1; +#else + // init blas_cpu_number if needed + blas_get_cpu_number(); + return blas_cpu_number; +#endif +} + +struct release_t { + void *address; + void (*func)(struct release_t *); + long attr; +}; + +int hugetlb_allocated = 0; + +static struct release_t release_info[NUM_BUFFERS]; +static int release_pos = 0; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) +static int hot_alloc = 0; +#endif + +/* Global lock for memory allocation */ + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t alloc_lock = 0; +#else +static BLASULONG alloc_lock = 0UL; +#endif + +#ifdef ALLOC_MMAP + +static void alloc_mmap_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : munmap failed\n"); + } +} + + + +#ifdef NO_WARMUP + +static void *alloc_mmap(void *address){ + void *map_address; + + if (address){ + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + } else { + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + } + + if (map_address != (void *)-1) { + LOCK_COMMAND(&alloc_lock); + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + UNLOCK_COMMAND(&alloc_lock); + } + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + return map_address; +} + +#else + +#define BENCH_ITERATION 4 +#define SCALING 2 + +static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { + + BLASULONG original, *p; + BLASULONG start, stop, min; + int iter, i, count; + + min = (BLASULONG)-1; + + original = *(BLASULONG *)(address + size - PAGESIZE); + + *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address; + + for (iter = 0; iter < BENCH_ITERATION; iter ++ ) { + + p = (BLASULONG *)address; + + count = size / PAGESIZE; + + start = rpcc(); + + for (i = 0; i < count; i ++) { + p = (BLASULONG *)(*p); + } + + stop = rpcc(); + + if (min > stop - start) min = stop - start; + } + + *(BLASULONG *)(address + size - PAGESIZE + 
0) = original; + *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p; + + return min; +} + +static void *alloc_mmap(void *address){ + void *map_address, *best_address; + BLASULONG best, start, current; + BLASULONG allocsize; + + if (address){ + /* Just give up use advanced operation */ + map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc == 0) { + map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#endif + + map_address = mmap(NULL, BUFFER_SIZE * SCALING, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + + if (map_address != (void *)-1) { + +#ifdef OS_LINUX +#ifdef DEBUG + int ret=0; + ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("OpenBLAS alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } + +#else + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#endif +#endif + + + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + + start = (BLASULONG)map_address; + current = (SCALING - 1) * BUFFER_SIZE; + + while(current > 0) { + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } + + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; + + start = (BLASULONG)map_address; + + best = (BLASULONG)-1; + best_address = map_address; + + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + + current = run_bench(start, allocsize); + + if (best > current) { + best = current; + best_address = (void *)start; + } + + start += PAGESIZE; + + } + + if ((BLASULONG)best_address > (BLASULONG)map_address) + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + + munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + + map_address = best_address; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + hot_alloc = 2; +#endif + } + } +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + LOCK_COMMAND(&alloc_lock); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + } + UNLOCK_COMMAND(&alloc_lock); + + return map_address; +} + +#endif + +#endif + + +#ifdef ALLOC_MALLOC + +static void alloc_malloc_free(struct release_t *release){ + + free(release -> address); + +} + +static void *alloc_malloc(void *address){ + + void *map_address; + + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_malloc_free; + release_pos ++; + } + + return map_address; + +} + +#endif + +#ifdef ALLOC_QALLOC + +void *qalloc(int flags, size_t bytes); +void *qfree (void *address); + +#define QNONCACHE 0x1 +#define QCOMMS 0x2 +#define QFAST 0x4 + +static void alloc_qalloc_free(struct release_t *release){ + + qfree(release -> address); + +} + +static void *alloc_qalloc(void *address){ + void *map_address; + + map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void 
*)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_qalloc_free; + release_pos ++; + } + + return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); +} + +#endif + +#ifdef ALLOC_WINDOWS + +static void alloc_windows_free(struct release_t *release){ + + VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + +} + +static void *alloc_windows(void *address){ + void *map_address; + + map_address = VirtualAlloc(address, + BUFFER_SIZE, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_windows_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_DEVICEDRIVER +#ifndef DEVICEDRIVER_NAME +#define DEVICEDRIVER_NAME "/dev/mapper" +#endif + +static void alloc_devicedirver_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : Bugphysarea unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("OpenBLAS : Bugphysarea close failed.\n"); + } + +} + +static void *alloc_devicedirver(void *address){ + + int fd; + void *map_address; + + if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) { + + return (void *)-1; + + } + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_devicedirver_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_SHM + +static void alloc_shm_free(struct release_t *release){ + + if (shmdt(release -> address)) { + printf("OpenBLAS : Shared memory unmap failed.\n"); + } +} + +static void *alloc_shm(void *address){ + void *map_address; + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + + map_address = (void *)shmat(shmid, address, 0); + + if (map_address != (void *)-1){ + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + shmctl(shmid, IPC_RMID, 0); + + release_info[release_pos].address = map_address; + release_info[release_pos].attr = shmid; + release_info[release_pos].func = alloc_shm_free; + release_pos ++; + } + + return map_address; +} + +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + +static void alloc_hugetlb_free(struct release_t *release){ + +#if defined(OS_LINUX) || defined(OS_AIX) + if (shmdt(release -> address)) { + printf("OpenBLAS : Hugepage unmap failed.\n"); + } +#endif + +#ifdef __sun__ + + munmap(release -> address, BUFFER_SIZE); + +#endif + +#ifdef OS_WINDOWS + + VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + +#endif + +} + +static void *alloc_hugetlb(void *address){ + + void *map_address = (void *)-1; + +#if defined(OS_LINUX) || defined(OS_AIX) + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, +#ifdef OS_LINUX + SHM_HUGETLB | +#endif +#ifdef OS_AIX + SHM_LGPAGE | SHM_PIN | +#endif + IPC_CREAT | SHM_R | SHM_W); + + if (shmid != -1) { + map_address = (void *)shmat(shmid, address, SHM_RND); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + if (map_address != (void *)-1){ + shmctl(shmid, IPC_RMID, 0); + } + } +#endif + 
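+ /* The Solaris branch below takes a different route to large pages:
+    it advises the HAT layer via memcntl(MC_HAT_ADVISE) and then uses
+    a suitably aligned memalign() allocation instead of SysV shm. */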
+#ifdef __sun__ + struct memcntl_mha mha; + + mha.mha_cmd = MHA_MAPSIZE_BSSBRK; + mha.mha_flags = 0; + mha.mha_pagesize = HUGE_PAGESIZE; + memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); + + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); +#endif + +#ifdef OS_WINDOWS + + HANDLE hToken; + TOKEN_PRIVILEGES tp; + + if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { + CloseHandle(hToken); + return (void*)-1; + } + + if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) { + CloseHandle(hToken); + return (void*)-1; + } + + map_address = (void *)VirtualAlloc(address, + BUFFER_SIZE, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + tp.Privileges[0].Attributes = 0; + AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL); + + if (map_address == (void *)NULL) map_address = (void *)-1; + +#endif + + if (map_address != (void *)-1){ + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_hugetlb_free; + release_pos ++; + } + + return map_address; +} +#endif + +#endif + +#ifdef ALLOC_HUGETLBFILE + +static int hugetlb_pid = 0; + +static void alloc_hugetlbfile_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("OpenBLAS : HugeTLBfs unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("OpenBLAS : HugeTLBfs close failed.\n"); + } +} + +static void *alloc_hugetlbfile(void *address){ + + void *map_address = (void *)-1; + int fd; + char filename[64]; + + if (!hugetlb_pid) hugetlb_pid = getpid(); + + sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid); + + if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) { + return (void *)-1; + } + + unlink(filename); + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_hugetlbfile_free; + release_pos ++; + } + + return map_address; +} +#endif + + +#ifdef SEEK_ADDRESS +static BLASULONG base_address = 0UL; +#else +static BLASULONG base_address = BASE_ADDRESS; +#endif + +static volatile struct { + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif + +} memory[NUM_BUFFERS]; + +static int memory_initialized = 0; + +/* Memory allocation routine */ +/* procpos ... 
indicates where it comes from */
/* 0 : Level 3 functions */
/* 1 : Level 2 functions */
/* 2 : Thread */
+
+void *blas_memory_alloc(int procpos){
+
+ int position;
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+ int mypos;
+#endif
+
+ void *map_address;
+
+ void *(*memoryalloc[])(void *address) = {
+#ifdef ALLOC_DEVICEDRIVER
+ alloc_devicedirver,
+#endif
+/* Hugetlb implicitly assumes ALLOC_SHM */
+#ifdef ALLOC_SHM
+ alloc_shm,
+#endif
+#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
+ alloc_hugetlb,
+#endif
+#ifdef ALLOC_MMAP
+ alloc_mmap,
+#endif
+#ifdef ALLOC_QALLOC
+ alloc_qalloc,
+#endif
+#ifdef ALLOC_WINDOWS
+ alloc_windows,
+#endif
+#ifdef ALLOC_MALLOC
+ alloc_malloc,
+#endif
+ NULL,
+ };
+ void *(**func)(void *address);
+ LOCK_COMMAND(&alloc_lock);
+
+ if (!memory_initialized) {
+
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+ for (position = 0; position < NUM_BUFFERS; position ++){
+ memory[position].addr = (void *)0;
+ memory[position].pos = -1;
+ memory[position].used = 0;
+ memory[position].lock = 0;
+ }
+#endif
+
+#ifdef DYNAMIC_ARCH
+ gotoblas_dynamic_init();
+#endif
+
+#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
+ gotoblas_affinity_init();
+#endif
+
+#ifdef SMP
+ if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
+#endif
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
+#ifndef DYNAMIC_ARCH
+ blas_set_parameter();
+#endif
+#endif
+
+ memory_initialized = 1;
+
+ }
+ UNLOCK_COMMAND(&alloc_lock);
+
+#ifdef DEBUG
+ printf("Alloc Start ...\n");
+#endif
+
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+
+ mypos = WhereAmI();
+
+ position = mypos;
+ while (position >= NUM_BUFFERS) position >>= 1;
+
+ do {
+ if (!memory[position].used && (memory[position].pos == mypos)) {
+ LOCK_COMMAND(&alloc_lock);
+/* blas_lock(&memory[position].lock);*/
+
+ if (!memory[position].used) goto allocation;
+
+ UNLOCK_COMMAND(&alloc_lock);
+/* blas_unlock(&memory[position].lock);*/
+ }
+
+ position ++;
+
+ } while (position < NUM_BUFFERS);
+
+
+#endif
+
+ position = 0;
+
+ do {
+/* if (!memory[position].used) { */
+ LOCK_COMMAND(&alloc_lock);
+/* blas_lock(&memory[position].lock);*/
+
+ if (!memory[position].used) goto allocation;
+
+ UNLOCK_COMMAND(&alloc_lock);
+/* blas_unlock(&memory[position].lock);*/
+/* } */
+
+ position ++;
+
+ } while (position < NUM_BUFFERS);
+
+ goto error;
+
+ allocation :
+
+#ifdef DEBUG
+ printf(" Position -> %d\n", position);
+#endif
+
+ memory[position].used = 1;
+
+ UNLOCK_COMMAND(&alloc_lock);
+/* blas_unlock(&memory[position].lock);*/
+
+ if (!memory[position].addr) {
+ do {
+#ifdef DEBUG
+ printf("Allocation Start : %lx\n", base_address);
+#endif
+
+ map_address = (void *)-1;
+
+ func = &memoryalloc[0];
+
+ while ((func != NULL) && (map_address == (void *) -1)) {
+
+ map_address = (*func)((void *)base_address);
+
+#ifdef ALLOC_DEVICEDRIVER
+ if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
+ fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
+ }
+#endif
+
+#ifdef ALLOC_HUGETLBFILE
+ if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
+#ifndef OS_WINDOWS
+ fprintf(stderr, "OpenBLAS Warning ... 
HugeTLB(File) allocation failed.\n");
+#endif
+ }
+#endif
+
+#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
+ if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
+#endif
+
+ func ++;
+ }
+
+#ifdef DEBUG
+ printf(" Success -> %08lx\n", map_address);
+#endif
+ if (((BLASLONG) map_address) == -1) base_address = 0UL;
+
+ if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
+
+ } while ((BLASLONG)map_address == -1);
+
+ LOCK_COMMAND(&alloc_lock);
+ memory[position].addr = map_address;
+ UNLOCK_COMMAND(&alloc_lock);
+
+#ifdef DEBUG
+ printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
+#endif
+ }
+
+#if defined(WHEREAMI) && !defined(USE_OPENMP)
+
+ if (memory[position].pos == -1) memory[position].pos = mypos;
+
+#endif
+
+#ifdef DYNAMIC_ARCH
+
+ if (memory_initialized == 1) {
+
+ LOCK_COMMAND(&alloc_lock);
+
+ if (memory_initialized == 1) {
+
+ if (!gotoblas) gotoblas_dynamic_init();
+
+ memory_initialized = 2;
+ }
+
+ UNLOCK_COMMAND(&alloc_lock);
+
+ }
+#endif
+
+
+#ifdef DEBUG
+ printf("Mapped : %p %3d\n\n",
+ (void *)memory[position].addr, position);
+#endif
+
+ return (void *)memory[position].addr;
+
+ error:
+ printf("BLAS : Program is terminated because you tried to allocate too many memory regions.\n");
+
+ return NULL;
+}
+
+void blas_memory_free(void *free_area){
+
+ int position;
+
+#ifdef DEBUG
+ printf("Unmapped Start : %p ...\n", free_area);
+#endif
+
+ position = 0;
+ LOCK_COMMAND(&alloc_lock);
+
+ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
+ position++;
+
+ if (memory[position].addr != free_area) goto error;
+
+#ifdef DEBUG
+ printf(" Position : %d\n", position);
+#endif
+
+ // arm: ensure all writes are finished before other thread takes this memory
+ WMB;
+
+ memory[position].used = 0;
+ UNLOCK_COMMAND(&alloc_lock);
+
+#ifdef DEBUG
+ printf("Unmap Succeeded.\n\n");
+#endif
+
+ return;
+
+ error:
+ printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); + +#ifdef DEBUG + for (position = 0; position < NUM_BUFFERS; position++) + printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); +#endif + UNLOCK_COMMAND(&alloc_lock); + + return; +} + +void *blas_memory_alloc_nolock(int unused) { + void *map_address; + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + return map_address; +} + +void blas_memory_free_nolock(void * map_address) { + free(map_address); +} + +void blas_shutdown(void){ + + int pos; + +#ifdef SMP + BLASFUNC(blas_thread_shutdown)(); +#endif + + LOCK_COMMAND(&alloc_lock); + + for (pos = 0; pos < release_pos; pos ++) { + release_info[pos].func(&release_info[pos]); + } + +#ifdef SEEK_ADDRESS + base_address = 0UL; +#else + base_address = BASE_ADDRESS; +#endif + + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + memory[pos].addr = (void *)0; + memory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + memory[pos].pos = -1; +#endif + memory[pos].lock = 0; + } + + UNLOCK_COMMAND(&alloc_lock); + + return; +} + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + +#ifdef SMP +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t init_lock = 0; +#else +static BLASULONG init_lock = 0UL; +#endif +#endif + +static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, + void *sa, void *sb, BLASLONG pos) { + +#if !defined(ARCH_POWER) && !defined(ARCH_SPARC) + + size_t size; + BLASULONG buffer; + + size = BUFFER_SIZE - PAGESIZE; + buffer = (BLASULONG)sa + GEMM_OFFSET_A; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc != 2) { +#endif + +#ifdef SMP + LOCK_COMMAND(&init_lock); +#endif + + while (size > 0) { + *(int *)buffer = size; + buffer += PAGESIZE; + size -= PAGESIZE; + } + +#ifdef SMP + UNLOCK_COMMAND(&init_lock); +#endif + + size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + buffer = (BLASULONG)sa + GEMM_OFFSET_A; + + while (size > 0) { + *(int *)buffer = size; + buffer += 64; + size -= 64; + } + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + +#endif +} + +#ifdef SMP + +static void _init_thread_memory(void *buffer) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + int num_cpu; + + for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) { + + blas_queue_init(&queue[num_cpu]); + queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL; + queue[num_cpu].routine = &_touch_memory; + queue[num_cpu].args = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + } + + queue[num_cpu - 1].next = NULL; + queue[0].sa = buffer; + + exec_blas(num_cpu, queue); + +} +#endif + +static void gotoblas_memory_init(void) { + + void *buffer; + + hot_alloc = 1; + + buffer = (void *)blas_memory_alloc(0); + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif + + _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A)); + +#else + + _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0); + +#endif + + blas_memory_free(buffer); +} +#endif + +/* Initialization for all function; this function should be called before main */ + +static int gotoblas_initialized = 0; +extern void openblas_read_env(); + +void CONSTRUCTOR gotoblas_init(void) { + + if (gotoblas_initialized) return; + +#ifdef SMP + openblas_fork_handler(); +#endif + + openblas_read_env(); + +#ifdef PROFILE + moncontrol (0); +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_init(); 
+#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_init(); +#endif + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + gotoblas_memory_init(); +#endif + +//#if defined(OS_LINUX) +#if 0 + struct rlimit curlimit; + if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) + { + if ( curlimit.rlim_cur != curlimit.rlim_max ) + { + curlimit.rlim_cur = curlimit.rlim_max; + setrlimit(RLIMIT_STACK, &curlimit); + } + } +#endif + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif +#endif + +#ifdef FUNCTION_PROFILE + gotoblas_profile_init(); +#endif + + gotoblas_initialized = 1; + +#ifdef PROFILE + moncontrol (1); +#endif + +} + +void DESTRUCTOR gotoblas_quit(void) { + + if (gotoblas_initialized == 0) return; + + blas_shutdown(); + +#ifdef PROFILE + moncontrol (0); +#endif + +#ifdef FUNCTION_PROFILE + gotoblas_profile_quit(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_quit(); +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_quit(); +#endif + + gotoblas_initialized = 0; + +#ifdef PROFILE + moncontrol (1); +#endif +} + +#if defined(_MSC_VER) && !defined(__clang__) +BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) +{ + switch (ul_reason_for_call) + { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: + break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + default: + break; + } + return TRUE; +} + +/* + This is to allow static linking. + Code adapted from Google performance tools: + https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc + Reference: + https://sourceware.org/ml/pthreads-win32/2008/msg00028.html + http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp +*/ +static int on_process_term(void) +{ + gotoblas_quit(); + return 0; +} +#ifdef _WIN64 +#pragma comment(linker, "/INCLUDE:_tls_used") +#else +#pragma comment(linker, "/INCLUDE:__tls_used") +#endif + +#ifdef _WIN64 +#pragma const_seg(".CRT$XLB") +#else +#pragma data_seg(".CRT$XLB") +#endif +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; +#ifdef _WIN64 +#pragma const_seg() +#else +#pragma data_seg() +#endif + +#ifdef _WIN64 +#pragma const_seg(".CRT$XTU") +#else +#pragma data_seg(".CRT$XTU") +#endif +static int(*p_process_term)(void) = on_process_term; +#ifdef _WIN64 +#pragma const_seg() +#else +#pragma data_seg() +#endif +#endif + +#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) +/* Don't call me; this is just work around for PGI / Sun bug */ +void gotoblas_dummy_for_PGI(void) { + + gotoblas_init(); + gotoblas_quit(); + +#if 0 + asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); + asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); +#else + asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); + asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); +#endif +} +#endif + +#endif From 2a589c4b286b4ab2f117efdc501d2facc547a401 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 19:36:12 +0200 Subject: [PATCH 177/935] Add USE_TLS option to switch between old and new memory.c --- cmake/system.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system.cmake 
b/cmake/system.cmake index 48e8f75bc..18b2c3b87 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR) set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") endif () +if (USE_TLS) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS") +endif () + # Only for development # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") From 2caa2210bbfb5b69c3758b8158bb0bad4a0f5e58 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 19:37:11 +0200 Subject: [PATCH 178/935] Add USE_TLS option to choose between old and new implementation of memory.c --- Makefile.rule | 10 ++++++++-- Makefile.system | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 649aabe70..4b815d7a8 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.1.dev +VERSION = 0.3.3.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1 # BUILD_RELAPACK = 1 # If you want to use legacy threaded Level 3 implementation. -# USE_SIMPLE_THREADED_LEVEL3 = 1 +USE_SIMPLE_THREADED_LEVEL3 = 1 + +# If you want to use the new, still somewhat experimental code that uses +# thread-local storage instead of a central memory buffer in memory.c +# Note that if your system uses GLIBC, it needs to have at least glibc 2.21 +# for this to work. +USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran # compiler supports this. It's safe to keep comment it out if you diff --git a/Makefile.system b/Makefile.system index 4712d9525..2123af204 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif +ifdef USE_TLS +CCOMMON_OPT += -DUSE_TLS +endif + ifndef SYMBOLPREFIX SYMBOLPREFIX = endif From 5991d1a6cd9d7340d2ea7e393a00eab8e232394f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Aug 2018 22:12:40 +0200 Subject: [PATCH 179/935] Update memory.c --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 1d408fcda..7688937e5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || __GLIBC_PREREQ(2,20)) +#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || (defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2,20))) #warning "using tls version of memory.c" #include From b902a409863f14e3334ae79265fa353f21f98ed7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 26 Aug 2018 11:18:02 +0200 Subject: [PATCH 180/935] Rewrite glibc version check --- driver/others/memory.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 7688937e5..b2e154e8b 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,8 +73,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
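+/* Why the check below had to be rewritten: the preprocessor does not
+   short-circuit macro expansion, so a function-like use of
+   __GLIBC_PREREQ inside the same #if as defined(__GLIBC_PREREQ) is a
+   syntax error on non-glibc systems where the macro is undefined.
+   Splitting the test into nested #ifs that reduce to a single
+   COMPILE_TLS gate avoids that. */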
#include "common.h" -#if defined(USE_TLS) && ( !defined(__GLIBC_PREREQ) || (defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2,20))) -#warning "using tls version of memory.c" +#if defined(USE_TLS) +#define COMPILE_TLS +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2,20)) +#undef COMPILE_TLS +#endif +#endif +#endif + +#if defined(COMPILE_TLS) + #include #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) From b55690a659fbc1b9cd267da26e2e54e3bdf7be52 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 26 Aug 2018 11:31:07 +0200 Subject: [PATCH 181/935] typo fix --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index b2e154e8b..9d4ab19f5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(USE_TLS) #define COMPILE_TLS #if defined(__GLIBC_PREREQ) -#if !__GLIBC_PREREQ(2,20)) +#if !__GLIBC_PREREQ(2,20) #undef COMPILE_TLS #endif #endif From 9e917b16dbba25c013b3fa32d22476eb4ed15541 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Aug 2018 21:11:54 +0200 Subject: [PATCH 182/935] Fix missing replacements of ILAENV by ILAENV_2STAGE (lapack PR 272) This could cause spurious "parameter has an illegal value" errors in DSYEVR and related routines, see https://github.com/Reference-LAPACK/lapack/issues/262 --- lapack-netlib/SRC/chetrd_hb2st.F | 10 +++++----- lapack-netlib/SRC/chetrd_he2hb.f | 6 +++--- lapack-netlib/SRC/dsytrd_sb2st.F | 10 +++++----- lapack-netlib/SRC/dsytrd_sy2sb.f | 6 +++--- lapack-netlib/SRC/ssytrd_sb2st.F | 10 +++++----- lapack-netlib/SRC/ssytrd_sy2sb.f | 6 +++--- lapack-netlib/SRC/zhetrd_hb2st.F | 10 +++++----- lapack-netlib/SRC/zhetrd_he2hb.f | 6 +++--- 8 files changed, 32 insertions(+), 32 deletions(-) diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 91806bb1d..43da45640 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -280,8 +280,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -297,9 +297,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index fd8c3fbe0..e334532fe 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index 4ca0507e4..4d81fe226 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -277,8 +277,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -294,9 +294,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 85337f792..e0a5debc5 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index bd645327e..0df1173e4 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -277,8 +277,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -294,9 +294,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index c01fe3598..272876700 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 508afca06..86122cccc 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -280,8 +280,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. 
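+*     Note: ILAENV2STAGE uses its own ISPEC numbering for the 2-stage
+*     tuning parameters (shifted down by 16 from the old ILAENV specs
+*     17-21), which is why the calls below pass 2, 3 and 4 in place of
+*     the former 18, 19 and 20.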
* .. Executable Statements .. * @@ -297,9 +297,9 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e35578b42..e33bf4b2b 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -285,8 +285,8 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV + INTEGER ILAENV2STAGE + EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. * @@ -296,7 +296,7 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 From f3fd44a731c1997b1d79d4d16abc25d78dce88a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Aug 2018 21:34:07 +0200 Subject: [PATCH 183/935] Set USE_TRMM for all ZARCH variants to fix TRMM faults with zarch-generic fixes #1743 --- kernel/Makefile.L3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index b37e536ef..9258f216d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8) USE_TRMM = 1 endif -ifeq ($(CORE), Z13) +ifeq ($(ARCH), zarch) USE_TRMM = 1 endif From e17f969fa0f7e8c9f5525577198a17fd7a9da21a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Aug 2018 13:28:46 +0200 Subject: [PATCH 184/935] Assume cross-compilation if host and target os differ fixes 1674 --- c_check | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/c_check b/c_check index 3831d7aa3..64009504c 100644 --- a/c_check +++ b/c_check @@ -223,7 +223,6 @@ $data =~ /globl\s([_\.]*)(.*)/; $need_fu = $1; $cross = 0; -$cross = 1 if ($os ne $hostos); if ($architecture ne $hostarch) { $cross = 1; @@ -231,6 +230,8 @@ if ($architecture ne $hostarch) { $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); } +$cross = 1 if ($os ne $hostos); + $openmp = "" if $ENV{USE_OPENMP} != 1; $linker_L = ""; From 3197f86762f14753517dfebd7f8665cb6bf6c344 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Aug 2018 23:43:14 +0200 Subject: [PATCH 185/935] Version 0.3.3 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 20ce02e87..0f985455b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 3.dev) +set(OpenBLAS_PATCH_VERSION 3) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From f0563f14bab6afcb3263a4710087c704bddfbb98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Aug 2018 23:43:57 +0200 Subject: [PATCH 186/935] Version 0.3.3 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 
4b815d7a8..6457532c8 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 # This library's version
-VERSION = 0.3.3.dev
+VERSION = 0.3.3
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From fd8d1868a126bb9f12bbc43b36ee30d1ba943fbb Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 31 Aug 2018 00:07:48 +0200
Subject: [PATCH 187/935] Updates for 0.3.3

---
 Changelog.txt | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index 33dcacc51..faecd82e3 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,31 @@ OpenBLAS ChangeLog
+====================================================================
+Version 0.3.3
+31-Aug-2018
+
+common:
+ * thread memory allocation has been switched back to the method
+ used before version 0.3.1 due to unexpected problems caused by
+ the new code under some circumstances. A new compile-time option
+ USE_TLS has been added to enable the new code, and it is hoped
+ that this can become the default again in the next version.
+ * LAPACK PR272 has been integrated, which fixes spurious errors
+ in DSYEVR and related functions caused by missing conversion
+ from ILAENV to ILAENV_2STAGE in several _2stage routines.
+ * the cmake-generated OpenBLASConfig.cmake now uses correct case
+ for the name of the library
+ * added support for Haiku OS
+
+x86_64:
+ * added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
+ DSCAL, DGEMVN and DSYMVL
+ * added a workaround for a cygwin issue that prevented compilation
+ of AVX512 code
+
+IBM Z:
+ * added autodetection of Z14
+ * fixed TRMM errors in the generic target
+
 ====================================================================
 Version 0.3.2
 30-Jul-2018

From 2982ce505d35bde04013b3e1cf4755954901efe5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 31 Aug 2018 00:18:37 +0200
Subject: [PATCH 188/935] Update version to 0.3.4.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 20ce02e87..97c3b7777 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 3.dev)
+set(OpenBLAS_PATCH_VERSION 4.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 # Adhere to GNU filesystem layout conventions

From dbfd7524cd94fe15930ed2f78b7789f15b22fec0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 31 Aug 2018 00:19:21 +0200
Subject: [PATCH 189/935] Update version to 0.3.4.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index 4b815d7a8..25ed0357d 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 # This library's version
-VERSION = 0.3.3.dev
+VERSION = 0.3.4.dev
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library

From 9e2bb0c6417ade4a9cf4a5787e0eb9fd491e8fc3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 31 Aug 2018 00:21:13 +0200
Subject: [PATCH 190/935] Update with the changes from 0.3.3

---
 Changelog.txt | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index 33dcacc51..faecd82e3 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,31 @@ OpenBLAS ChangeLog
+====================================================================
+Version 0.3.3
+31-Aug-2018
+
+common:
+ * thread memory allocation has been switched back to the method
+ used before version 0.3.1 due to unexpected problems caused by
+ the new code under some circumstances. A new compile-time option
+ USE_TLS has been added to enable the new code, and it is hoped
+ that this can become the default again in the next version.
+ * LAPACK PR272 has been integrated, which fixes spurious errors
+ in DSYEVR and related functions caused by missing conversion
+ from ILAENV to ILAENV_2STAGE in several _2stage routines.
+ * the cmake-generated OpenBLASConfig.cmake now uses correct case
+ for the name of the library
+ * added support for Haiku OS
+
+x86_64:
+ * added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
+ DSCAL, DGEMVN and DSYMVL
+ * added a workaround for a cygwin issue that prevented compilation
+ of AVX512 code
+
+IBM Z:
+ * added autodetection of Z14
+ * fixed TRMM errors in the generic target
+
 ====================================================================
 Version 0.3.2
 30-Jul-2018

From a4bd41e9f2bbebfe2453de7a43194b185fd72da5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 4 Sep 2018 10:51:19 +0200
Subject: [PATCH 191/935] Fix paths to C kernels for nrm2

---
 kernel/arm64/KERNEL | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL
index aeccfbf4c..f936cdf47 100644
--- a/kernel/arm64/KERNEL
+++ b/kernel/arm64/KERNEL
@@ -1,17 +1,17 @@
 ifndef SNRM2KERNEL
-SNRM2KERNEL = nrm2.c
+SNRM2KERNEL = ../arm/nrm2.c
 endif
 ifndef DNRM2KERNEL
-DNRM2KERNEL = nrm2.c
+DNRM2KERNEL = ../arm/nrm2.c
 endif
 ifndef CNRM2KERNEL
-CNRM2KERNEL = znrm2.c
+CNRM2KERNEL = ../arm/znrm2.c
 endif
 ifndef ZNRM2KERNEL
-ZNRM2KERNEL = znrm2.c
+ZNRM2KERNEL = ../arm/znrm2.c
 endif
 ifndef SCABS_KERNEL

From 1cb7b9015ebd49e1cbf09eb289b7a6d5bba5ea31 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 4 Sep 2018 11:06:51 +0200
Subject: [PATCH 192/935] Conditional compilation of assembly files that IOS
 does not like

---
 kernel/arm64/KERNEL.ARMV8 | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index d05754628..4c6d6fb71 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -51,10 +51,12 @@ CDOTKERNEL = zdot.S
 ZDOTKERNEL = zdot.S
 DSDOTKERNEL = dot.S
+ifneq ($(OS_DARWIN)$(CROSS),11)
 SNRM2KERNEL = nrm2.S
 DNRM2KERNEL = nrm2.S
 CNRM2KERNEL = znrm2.S
 ZNRM2KERNEL = znrm2.S
+endif
 SROTKERNEL = rot.S
 DROTKERNEL = rot.S
@@ -86,7 +88,11 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c
 CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
 ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
+ifneq ($(OS_DARWIN)$(CROSS),11)
 SGEMMKERNEL = sgemm_kernel_4x4.S
+else
+SGEMMKERNEL = ../generic/gemmkernel_2x2.c
+endif
 SGEMMONCOPY = ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
 SGEMMONCOPYOBJ = sgemm_oncopy.o

From 8aeab0601e9787698a2af16e21bbaba9621183dd Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 6 Sep 2018 16:39:52 +0200
Subject: [PATCH 193/935] Follow netlib renaming/aliasing CBLAS_ORDER to CBLAS_LAYOUT fixes #1754 --- cblas.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cblas.h b/cblas.h index 6461f4209..347089e5b 100644 --- a/cblas.h +++ b/cblas.h @@ -46,12 +46,13 @@ int openblas_get_parallel(void); #define CBLAS_INDEX size_t -typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; +typedef enum CBLAS_LAYOUT {CblasRowMajor=101, CblasColMajor=102} CBLAS_LAYOUT; typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; - +typedef CBLAS_LAYOUT CBLAS_ORDER; + float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); From b57af9379270753ef69f4934ed7c57ee89f5833b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Sep 2018 16:54:31 +0200 Subject: [PATCH 194/935] just make CBLAS_LAYOUT an alias of the existing CBLAS_ORDER to avoid having to change all instances of enum CBLAS_ORDER in this file --- cblas.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cblas.h b/cblas.h index 347089e5b..d340a2037 100644 --- a/cblas.h +++ b/cblas.h @@ -46,12 +46,12 @@ int openblas_get_parallel(void); #define CBLAS_INDEX size_t -typedef enum CBLAS_LAYOUT {CblasRowMajor=101, CblasColMajor=102} CBLAS_LAYOUT; +typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; -typedef CBLAS_LAYOUT CBLAS_ORDER; +typedef CBLAS_ORDER CBLAS_LAYOUT; float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); From 4cf7315a5d5c512b1f38c523d4cd28c399b2000d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Sep 2018 21:41:54 +0200 Subject: [PATCH 195/935] Adjust ARMV8 SGEMM unrolling when using the C fallback kernel_2x2 for IOS --- param.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/param.h b/param.h index cfa4bba5c..ded9fe0b8 100644 --- a/param.h +++ b/param.h @@ -2590,8 +2590,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
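/* An illustrative aside, not part of PATCH 195 whose param.h diff continues
   below: the net effect of PATCH 193 and PATCH 194 above is that cblas.h
   keeps CBLAS_ORDER as the real enum and adds CBLAS_LAYOUT as a typedef
   alias for it, so the two spellings name the same type and existing code
   keeps compiling. A minimal self-contained sketch of what the alias buys: */
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
typedef CBLAS_ORDER CBLAS_LAYOUT;

static int old_spelling(CBLAS_ORDER order)   { return order  == CblasColMajor; }
static int new_spelling(CBLAS_LAYOUT layout) { return layout == CblasRowMajor; }

int main(void) {
  /* both parameter types are the same type; note the alias covers only the
     typedef name, an "enum CBLAS_LAYOUT" tag still does not exist */
  return (old_spelling(CblasColMajor) && new_spelling(CblasRowMajor)) ? 0 : 1;
}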
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#if defined(OS_DARWIN) && defined(CROSS) +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 +#else #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 +#endif #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 From 1e531701b7ab24a069ec5e549fc08eaca49050a1 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 9 Sep 2018 16:52:25 +0200 Subject: [PATCH 196/935] fix small typo --- kernel/generic/trmm_lncopy_16.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/generic/trmm_lncopy_16.c b/kernel/generic/trmm_lncopy_16.c index 4c0a76cbd..0f4b0a9f7 100644 --- a/kernel/generic/trmm_lncopy_16.c +++ b/kernel/generic/trmm_lncopy_16.c @@ -661,7 +661,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; - b[ 11] = ZERO; + b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; From 58363542e73998250a6829e8aa4f4d4e8f94337f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Sep 2018 10:51:17 +0200 Subject: [PATCH 197/935] remove unused variable ldb_t Copied from Reference-LAPACK PR283 --- lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c index 2cc7b9ad2..dbd6e9049 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_dsytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); double* a_t = NULL; double* tb_t = NULL; /* Check leading dimension(s) */ From 5cf090f516e7ea48316901fb3e1ea4ab086db25b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Sep 2018 10:52:30 +0200 Subject: [PATCH 198/935] remove unused variable ldb_t Copied from Reference-LAPACK PR283 --- lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c index 5b8010d9e..b9ba0fb56 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_zhetrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); lapack_complex_double* a_t = NULL; lapack_complex_double* tb_t = NULL; /* Check leading dimension(s) */ From 094f8c3b579468636cada39ead49c43532b91b62 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Sep 2018 10:53:47 +0200 Subject: [PATCH 199/935] remove unused variable ldb_t Copied from Reference-LAPACK PR283 --- lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c index f91c42257..db27e2873 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_zsytrf_aa_2stage_work( int matrix_layout, char uplo,
lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); lapack_complex_double* a_t = NULL; lapack_complex_double* tb_t = NULL; /* Check leading dimension(s) */ From 30f5a69ab858c0c110f8e188d924d5fb117d3f81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Sep 2018 14:23:31 +0200 Subject: [PATCH 200/935] Add explicit cast to silence a warning for #1710 --- interface/lapack/laswp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c index ebeb103e7..0dde33ae3 100644 --- a/interface/lapack/laswp.c +++ b/interface/lapack/laswp.c @@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, - laswp[flag], nthreads); + (int(*)())laswp[flag], nthreads); } #endif From f3c262156e88b204731c46221400d77c7b4f0c49 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Sep 2018 14:24:29 +0200 Subject: [PATCH 201/935] Add an explicit cast to silence a warning for #1710 --- interface/lapack/zlaswp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c index 31e08451d..b77a40985 100644 --- a/interface/lapack/zlaswp.c +++ b/interface/lapack/zlaswp.c @@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); + blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads); } #endif From 2349e151497dc4686413d65954d5418519dfc320 Mon Sep 17 00:00:00 2001 From: Yuri Date: Sat, 15 Sep 2018 19:59:17 -0700 Subject: [PATCH 202/935] Allow to install the 'interface64' version concurrently with the regular version --- CMakeLists.txt | 30 ++++++++++++++++++------------ cmake/fc.cmake | 5 +++++ cmake/openblas.pc.in | 3 ++- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97c3b7777..9513488c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,6 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(OpenBLAS_LIBNAME openblas) ####### if(MSVC) option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) @@ -43,6 +41,8 @@ message(WARNING "CMake support is experimental.
This will not produce the same M include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") +set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) + set(BLASDIRS interface driver/level2 driver/level3 driver/others) if (NOT DYNAMIC_ARCH) @@ -214,11 +214,15 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} - EXPORT "OpenBLASTargets" + EXPORT "OpenBLAS${SUFFIX64}Targets" RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +# Install headers +set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) +set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) + message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}") set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h) @@ -266,29 +270,31 @@ if(NOT NO_LAPACKE) ADD_CUSTOM_TARGET(genlapacke COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() include(FindPkgConfig QUIET) if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) + configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) + install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". set(PN OpenBLAS) -set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") +set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") configure_package_config_file(cmake/${PN}Config.cmake.in - "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake VERSION ${${PN}_VERSION} COMPATIBILITY AnyNewerVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + RENAME ${PN}${SUFFIX64}ConfigVersion.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) -install(EXPORT "${PN}Targets" - NAMESPACE "${PN}::" +install(EXPORT "${PN}${SUFFIX64}Targets" + NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 1446a900d..38d59f956 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,6 +3,11 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. 
+if (INTERFACE64) + set(SUFFIX64 64) + set(SUFFIX64_UNDERSCORE _64) +endif() + if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index ca88a6d5f..df4b2ab06 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,4 +1,5 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ @@ -6,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas +Libs: -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} From b402626509070764b2c6e0302e19c7b779372fe0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Sep 2018 12:43:36 +0200 Subject: [PATCH 203/935] Do not use the new TLS code for non-threaded builds even if USE_TLS is set Workaround for #1761 as that exposed a problem in the new code (which was intended to speed up multithreaded code only anyway). --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 9d4ab19f5..e73d53fa2 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(USE_TLS) +#if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS #if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2,20) From 1ad1e79062d40cc9445e5c2098e15b8c45081a75 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Sep 2018 18:03:43 +0200 Subject: [PATCH 204/935] Catch inadvertent USE_TLS=0 declaration for #1766 --- driver/others/memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index e73d53fa2..0019253c0 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -75,6 +75,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS + +#if USE_TLS != 1 +#undef COMPILE_TLS +#endif + #if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2,20) #undef COMPILE_TLS From 288aeea8a285da8551c465681c7b9330a5486e7e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Sep 2018 18:08:31 +0200 Subject: [PATCH 205/935] Fix default settings - USE_TLS and USE_SIMPLE_THREADED_LEVEL3 should both be off --- Makefile.rule | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 25ed0357d..8c651412e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -107,13 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1 # BUILD_RELAPACK = 1 # If you want to use legacy threaded Level 3 implementation. -USE_SIMPLE_THREADED_LEVEL3 = 1 +# USE_SIMPLE_THREADED_LEVEL3 = 1 # If you want to use the new, still somewhat experimental code that uses # thread-local storage instead of a central memory buffer in memory.c # Note that if your system uses GLIBC, it needs to have at least glibc 2.21 # for this to work. -USE_TLS = 1 +# USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran # compiler supports this. 
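/* A hedged, standalone sketch (macro names as in the memory.c patches above)
   of how PATCH 203 and PATCH 204 combine: the TLS allocator is compiled only
   for threaded (SMP) builds, and an inadvertent "make USE_TLS=0", which can
   reach the preprocessor as a macro defined to 0 so that defined(USE_TLS)
   alone would still be true, is treated like not requesting TLS at all.
   The glibc version check from the original file is omitted here. */
#if defined(USE_TLS) && defined(SMP)
#define COMPILE_TLS
#if USE_TLS != 1
#undef COMPILE_TLS
#endif
#endif

#ifdef COMPILE_TLS
/* the new thread-local storage based allocator would be compiled here */
#else
/* the older central-buffer allocator is compiled instead */
#endif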
It's safe to keep comment it out if you From 6f77af2eef8a6ea2c5e32c66528849c319d4fb6d Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Fri, 21 Sep 2018 09:19:51 +0000 Subject: [PATCH 206/935] Add `$(LDFLAGS)` to `$(CC)` and `$(FC)` invocations within `exports/Makefile` --- exports/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 29075a9c2..3a5f77db3 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -114,9 +114,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c From cf6df9464c4e30d844726e986fbb8834fcdb8dc8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Sep 2018 12:31:37 +0200 Subject: [PATCH 207/935] Document the stub status of the QUAD_PRECISION code (#1772) * Document the stub status of the QUAD_PRECISION code inherited from GotoBLAS2 in response to #1769 --- Makefile.rule | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.rule b/Makefile.rule index 8c651412e..6522b0777 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -152,6 +152,9 @@ NO_AFFINITY = 1 # FUNCTION_PROFILE = 1 # Support for IEEE quad precision(it's *real* REAL*16)( under testing) +# This option should not be used - it is a holdover from unfinished code present +# in the original GotoBLAS2 library that may be usable as a starting point but +# is not even expected to compile in its present form.
# QUAD_PRECISION = 1 # Theads are still working for a while after finishing BLAS operation From 28aa94bf4be41324a46558d979e428bb4ca19a33 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Sep 2018 14:00:15 +0200 Subject: [PATCH 208/935] Include thread numbers in failure message from blas_thread_init to aid in debugging cases like #1767 --- driver/others/blas_server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 1d7f570d8..6a25e2d07 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -582,7 +582,7 @@ int blas_thread_init(void){ if(ret!=0){ struct rlimit rlim; const char *msg = strerror(ret); - fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " From 7e5df34e6afede4bcdaa20866353c96ae2512052 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 25 Sep 2018 09:41:58 +0200 Subject: [PATCH 209/935] Convert fldmia/fstmia instructions to UAL syntax for clang7 fixes #1774 --- kernel/arm/asum_vfp.S | 76 +++++----- kernel/arm/axpy_vfp.S | 124 +++++++-------- kernel/arm/ccopy_vfp.S | 28 ++-- kernel/arm/cdot_vfp.S | 40 ++--- kernel/arm/cgemm_kernel_2x2_vfp.S | 44 +++--- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 64 ++++---- kernel/arm/cgemm_tcopy_2_vfp.S | 20 +-- kernel/arm/cgemv_n_vfp.S | 32 ++-- kernel/arm/cgemv_t_vfp.S | 40 ++--- kernel/arm/ctrmm_kernel_2x2_vfp.S | 32 ++-- kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 52 +++---- kernel/arm/dcopy_vfp.S | 28 ++-- kernel/arm/ddot_vfp.S | 40 ++--- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 8 +- kernel/arm/dgemm_tcopy_4_vfp.S | 60 ++++---- kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 26 ++-- kernel/arm/gemv_n_vfp.S | 100 ++++++------- kernel/arm/gemv_n_vfpv3.S | 120 +++++++-------- kernel/arm/gemv_t_vfp.S | 168 ++++++++++----------- kernel/arm/gemv_t_vfpv3.S | 168 ++++++++++----------- kernel/arm/iamax_vfp.S | 32 ++-- kernel/arm/nrm2_vfp.S | 16 +- kernel/arm/nrm2_vfpv3.S | 16 +- kernel/arm/rot_vfp.S | 224 ++++++++++++++-------------- kernel/arm/scal_vfp.S | 76 +++++----- kernel/arm/scopy_vfp.S | 32 ++-- kernel/arm/sdot_vfp.S | 72 ++++----- kernel/arm/sgemm_kernel_4x2_vfp.S | 4 +- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 40 ++--- kernel/arm/sgemm_tcopy_4_vfp.S | 70 ++++----- kernel/arm/strmm_kernel_4x2_vfp.S | 4 +- kernel/arm/strmm_kernel_4x4_vfpv3.S | 34 ++--- kernel/arm/swap_vfp.S | 112 +++++++------- kernel/arm/zcopy_vfp.S | 28 ++-- kernel/arm/zdot_vfp.S | 40 ++--- kernel/arm/zgemm_kernel_2x2_vfp.S | 24 +-- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 24 +-- kernel/arm/zgemm_tcopy_2_vfp.S | 20 +-- kernel/arm/zgemv_n_vfp.S | 32 ++-- kernel/arm/zgemv_t_vfp.S | 40 ++--- 40 files changed, 1105 insertions(+), 1105 deletions(-) diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index 5b08e5028..9a75885a2 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -58,11 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
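/* The conversions in this patch are mechanical: clang 7 no longer accepts
   the pre-UAL spellings (#1774), and the load/store-multiple mnemonics map
   one-for-one onto their unified assembler language (UAL) equivalents,
       fldmias -> vldmia.f32    fldmiad -> vldmia.f64
       fstmias -> vstmia.f32    fstmiad -> vstmia.f64
   with register lists, base registers and the "!" writeback carried over
   unchanged, so the encoded instructions are identical. */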
.macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 @@ -82,22 +82,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X @@ -107,7 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X @@ -118,11 +118,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 @@ -133,7 +133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 @@ -142,22 +142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X @@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X @@ -184,11 +184,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -196,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vadd.f64 d1 , d1, d7 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -212,11 +212,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 @@ -226,28 +226,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S4 - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 @@ -273,22 +273,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 vadd.f32 s0 , s0, s6 vadd.f32 s1 , s1, s7 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 @@ -300,11 +300,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 @@ -313,28 +313,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index c35b8aece..39c9ac233 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -146,17 +146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } fmacd d8 , d0, d4 - fstmiad Y!, { d8 } + vstmia.f64 Y!, { d8 } fmacd d9 , d0, d5 - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d9 } fmacd d10, d0, d6 - fstmiad Y!, { d10 } + vstmia.f64 Y!, { d10 } fmacd d11, d0, d7 - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d11 } .endm @@ -164,19 +164,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 } - fldmiad Y , { d8 } + vldmia.f64 X!, { d4 } + vldmia.f64 Y , { d8 } fmacd d8 , d0, d4 - fstmiad Y!, { d8 } + vstmia.f64 Y!, { d8 } .endm .macro KERNEL_S1 - fldmiad X , { d4 } - fldmiad Y , { d8 } + vldmia.f64 X , { d4 } + vldmia.f64 Y , { d8 } fmacd d8 , d0, d4 - fstmiad Y , { d8 } + vstmia.f64 Y , { d8 } add X, X, INC_X add Y, Y, INC_Y @@ -186,16 +186,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s4 - s7 } - fldmias Y , { s8 - s11 } + vldmia.f32 X!, { s4 - s7 } + vldmia.f32 Y , { s8 - s11 } fmacs s8 , s0, s4 - fstmias Y!, { s8 } + vstmia.f32 Y!, { s8 } fmacs s9 , s0, s5 - fstmias Y!, { s9 } + vstmia.f32 Y!, { s9 } fmacs s10, s0, s6 - fstmias Y!, { s10 } + vstmia.f32 Y!, { s10 } fmacs s11, s0, s7 - fstmias Y!, { s11 } + vstmia.f32 Y!, { s11 } .endm @@ -203,19 +203,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } - fldmias Y , { s8 } + vldmia.f32 X!, { s4 } + vldmia.f32 Y , { s8 } fmacs s8 , s0, s4 - fstmias Y!, { s8 } + vstmia.f32 Y!, { s8 } .endm .macro KERNEL_S1 - fldmias X , { s4 } - fldmias Y , { s8 } + vldmia.f32 X , { s4 } + vldmia.f32 Y , { s8 } fmacs s8 , s0, s4 - fstmias Y , { s8 } + vstmia.f32 Y , { s8 } add X, X, INC_X add Y, Y, INC_Y @@ -231,42 +231,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 - fstmiad Y!, { d10 } - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d10 } + vstmia.f64 Y!, { d11 } pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 - fstmiad Y!, { d10 } - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d10 } + vstmia.f64 Y!, { d11 } @@ -277,15 +277,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } - fldmiad Y , { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } @@ -293,14 +293,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X , { d4 - d5 } - fldmiad Y , { d8 - d9 } + vldmia.f64 X , { d4 - d5 } + vldmia.f64 Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y , { d8 - d9 } + vstmia.f64 Y , { d8 - d9 } add X, X, INC_X add Y, Y, INC_Y @@ -314,40 +314,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmias X!, { s4 - s7 } + vldmia.f32 X!, { s4 - s7 } pld [ Y, #X_PRE ] - fldmias Y , { s8 - s11 } + vldmia.f32 Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 - fstmias Y!, { s10 } - fstmias Y!, { s11 } + vstmia.f32 Y!, { s10 } + vstmia.f32 Y!, { s11 } - fldmias X!, { s4 - s7 } - fldmias Y , { s8 - s11 } + vldmia.f32 X!, { s4 - s7 } + vldmia.f32 Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 - fstmias Y!, { s10 } - fstmias Y!, { s11 } + vstmia.f32 Y!, { s10 } + vstmia.f32 Y!, { s11 } @@ -358,15 +358,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } - fldmias Y , { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } @@ -374,14 +374,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X , { s4 - s5 } - fldmias Y , { s8 - s9 } + vldmia.f32 X , { s4 - s5 } + vldmia.f32 Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y , { s8 - s9 } + vstmia.f32 Y , { s8 - s9 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/ccopy_vfp.S b/kernel/arm/ccopy_vfp.S index 874fcab9c..fbb32b43c 100644 --- a/kernel/arm/ccopy_vfp.S +++ b/kernel/arm/ccopy_vfp.S @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F4 pld [ X, #X_PRE ] - fldmias X!, { s0 - s7 } - fstmias Y!, { s0 - s7 } + vldmia.f32 X!, { s0 - s7 } + vstmia.f32 Y!, { s0 - s7 } .endm .macro COPY_F1 - fldmias X!, { s0 - s1 } - fstmias Y!, { s0 - s1 } + vldmia.f32 X!, { s0 - s1 } + vstmia.f32 Y!, { s0 - s1 } .endm @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s2 - s3 } - fstmias Y, { s2 - s3 } + vldmia.f32 X, { s2 - s3 } + vstmia.f32 Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s2 - s3 } - fstmias Y, { s2 - s3 } + vldmia.f32 X, { s2 - s3 } + vstmia.f32 Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index fd86a37b0..85246d734 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -76,30 +76,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
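/* A rough C analogue (hypothetical helper, not OpenBLAS code) of the COPY4x4
   macro above from dgemm_tcopy_4_vfp.S: packing gathers a 4x4 tile from a
   matrix with leading dimension lda (in elements here; the assembly uses a
   byte stride) into a contiguous buffer, so the GEMM micro-kernel can later
   stream it with unit stride. */
#include <stddef.h>

static void copy4x4(const double *a, size_t lda, double *b) {
  /* four source rows spaced lda elements apart, four elements each,
     written back to back, mirroring the vldmia/vstmia pairs above */
  for (size_t i = 0; i < 4; i++)
    for (size_t j = 0; j < 4; j++)
      *b++ = a[i * lda + j];
}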
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 - fldmias Y!, { s10 - s11 } + vldmia.f32 Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 fmacs s3 , s7, s10 - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 - fldmias Y!, { s10 - s11 } + vldmia.f32 Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -125,8 +125,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -134,8 +134,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -152,8 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index 71bc50efd..d2591919e 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmuls s8 , s0, s4 @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M2 - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_E - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -329,9 +329,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } - fldmias CO2, { s4 - s7 } + vldmia.f32 CO2, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -343,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias CO2, { s4 - s7 } + vstmia.f32 CO2, { s4 - s7 } add CO1, CO1, #16 @@ -500,23 +500,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } - fldmias CO2, { s4 - s5 } + vldmia.f32 CO2, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias CO2, { s4 - s5 } + vstmia.f32 CO2, { s4 - s5 } add CO1, CO1, #8 @@ -671,7 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -683,7 +683,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -800,14 +800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 9d473ad78..5ebc904ac 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -182,30 +182,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 @@ -218,17 +218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 @@ -250,19 +250,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 @@ -300,16 +300,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } - fldmias CO2, { s8 - s11 } + vldmia.f32 CO1, { s4 - s7 } + vldmia.f32 CO2, { s8 - s11 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -370,8 +370,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 - fstmias CO1, { s4 - s7 } - fstmias CO2, { s8 - s11 } + vstmia.f32 CO1, { s4 - s7 } + vstmia.f32 CO2, { s8 - s11 } add CO1, CO1, #16 @@ -534,8 +534,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } - fldmias CO2, { s8 - s9 } + vldmia.f32 CO1, { s4 - s5 } + vldmia.f32 CO2, { s8 - s9 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -552,8 +552,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 - fstmias CO1, { s4 - s5 } - fstmias CO2, { s8 - s9 } + vstmia.f32 CO1, { s4 - s5 } + vstmia.f32 CO2, { s8 - s9 } add CO1, CO1, #8 @@ -716,7 +716,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -851,7 +851,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -861,7 +861,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/cgemm_tcopy_2_vfp.S b/kernel/arm/cgemm_tcopy_2_vfp.S index 9036b994d..7b3ae18d4 100644 --- a/kernel/arm/cgemm_tcopy_2_vfp.S +++ b/kernel/arm/cgemm_tcopy_2_vfp.S @@ -73,12 +73,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ .macro COPY2x2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } - fstmias BO1, { s0 - s7 } + vstmia.f32 BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -86,12 +86,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmias AO1, { s0 -s1 } + vldmia.f32 AO1, { s0 -s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } - fstmias BO2, { s0 - s3 } + vstmia.f32 BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 @@ -100,9 +100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*************************************************************************************************************************/ .macro COPY2x1 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } - fstmias BO1, { s0 - s3 } + vstmia.f32 BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -110,9 +110,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } - fstmias BO2, { s0 - s1 } + vstmia.f32 BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 62ee33bb9..d6b18c796 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -201,7 +201,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -213,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -266,14 +266,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, #8 @@ -349,47 +349,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y @@ -430,14 +430,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index c07b6d6f8..6833df7d1 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -150,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s2 - s3 } - fldmias AO1!, { s4 - s5 } - fldmias AO2!, { s8 - s9 } + vldmia.f32 XO! , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } + vldmia.f32 AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -180,7 +180,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -204,8 +204,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 - s3 } - fldmias AO1!, { s4 - s5 } + vldmia.f32 XO! , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -216,14 +216,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO!, { s4 - s5 } + vstmia.f32 YO!, { s4 - s5 } .endm @@ -249,9 +249,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S2X1 - fldmias XO , { s2 - s3 } - fldmias AO1!, { s4 - s5 } - fldmias AO2!, { s8 - s9 } + vldmia.f32 XO , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } + vldmia.f32 AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -269,25 +269,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y @@ -313,8 +313,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s2 - s3 } - fldmias AO1!, { s4 - s5 } + vldmia.f32 XO , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -327,14 +327,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index aae890ea9..ca1a512fb 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmuls s8 , s0, s4 @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M2 - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_E - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } flds s4, FP_ZERO vmov.f32 s5, s4 @@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias CO2, { s4 - s7 } + vstmia.f32 CO2, { s4 - s7 } add CO1, CO1, #16 @@ -513,7 +513,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } flds s4, FP_ZERO vmov.f32 s5, s4 @@ -523,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias CO2, { s4 - s5 } + vstmia.f32 CO2, { s4 - s5 } add CO1, CO1, #8 @@ -693,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -818,7 +818,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 79e7ed07f..d75fb7735 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -170,30 +170,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 @@ -206,17 +206,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 @@ -238,19 +238,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 @@ -288,16 +288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 @@ -354,8 +354,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 - fstmias CO1, { s4 - s7 } - fstmias CO2, { s8 - s11 } + vstmia.f32 CO1, { s4 - s7 } + vstmia.f32 CO2, { s8 - s11 } add CO1, CO1, #16 @@ -532,8 +532,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 - fstmias CO1, { s4 - s5 } - fstmias CO2, { s8 - s9 } + vstmia.f32 CO1, { s4 - s5 } + vstmia.f32 CO2, { s8 - s9 } add CO1, CO1, #8 @@ -710,7 +710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -835,7 +835,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/dcopy_vfp.S b/kernel/arm/dcopy_vfp.S index da239924a..7ee52af88 100644 --- a/kernel/arm/dcopy_vfp.S +++ b/kernel/arm/dcopy_vfp.S @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F4 pld [ X, #X_PRE ] - fldmiad X!, { d0 - d3 } - fstmiad Y!, { d0 - d3 } + vldmia.f64 X!, { d0 - d3 } + vstmia.f64 Y!, { d0 - d3 } .endm .macro COPY_F1 - fldmiad X!, { d0 } - fstmiad Y!, { d0 } + vldmia.f64 X!, { d0 } + vstmia.f64 Y!, { d0 } .endm @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d1 } - fstmiad Y, { d1 } + vldmia.f64 X, { d1 } + vstmia.f64 Y, { d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d1 } - fstmiad Y, { d1 } + vldmia.f64 X, { d1 } + vstmia.f64 Y, { d1 } add X, X, INC_X add Y, Y, INC_Y @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index cc2e485b7..4dff5a3e1 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -67,26 +67,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d8 } + vldmia.f64 X!, { d8 } pld [ Y, #X_PRE ] - fldmiad Y!, { d4 } - fldmiad Y!, { d5 } + vldmia.f64 Y!, { d4 } + vldmia.f64 Y!, { d5 } fmacd d0 , d4, d8 - fldmiad X!, { d9 } - fldmiad Y!, { d6 } + vldmia.f64 X!, { d9 } + vldmia.f64 Y!, { d6 } fmacd d1 , d5, d9 - fldmiad X!, { d10 } - fldmiad X!, { d11 } + vldmia.f64 X!, { d10 } + vldmia.f64 X!, { d11 } fmacd d0 , d6, d10 - fldmiad Y!, { d7 } + vldmia.f64 Y!, { d7 } fmacd d1 , d7, d11 .endm .macro KERNEL_F1 - fldmiad X!, { d4 } - fldmiad Y!, { d8 } + vldmia.f64 X!, { d4 } + vldmia.f64 Y!, { d8 } fmacd d0 , d4, d8 .endm @@ -97,26 +97,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S4 nop - fldmiad X, { d4 } - fldmiad Y, { d8 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d8 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d4, d8 - fldmiad X, { d5 } - fldmiad Y, { d9 } + vldmia.f64 X, { d5 } + vldmia.f64 Y, { d9 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d5, d9 - fldmiad X, { d6 } - fldmiad Y, { d10 } + vldmia.f64 X, { d6 } + vldmia.f64 Y, { d10 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d6, d10 - fldmiad X, { d7 } - fldmiad Y, { d11 } + vldmia.f64 X, { d7 } + vldmia.f64 Y, { d11 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d7, d11 @@ -126,8 +126,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } - fldmiad Y, { d8 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d8 } add X, X, INC_X fmacd d0 , d4, d8 add Y, Y, INC_Y diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 1744b54d8..d852c2dad 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add r4 , CO2, r3 pld [ CO2 , #C_PRE ] - fldmiad CO1, { d8 - d11 } + vldmia.f64 CO1, { d8 - d11 } pld [ r4 , #C_PRE ] fmacd d8 , d0 , d16 @@ -352,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d15, d0 , d23 fstd d11, [CO1, #24 ] - fldmiad r4, { d8 - d11 } + vldmia.f64 r4, { d8 - d11 } fmacd d8 , d0 , d24 fstd d12, [CO2] @@ -367,7 +367,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO2 , #C_PRE ] - fldmiad CO2, { d12 - d15 } + vldmia.f64 CO2, { d12 - d15 } fstd d8 , [r4 ] fmacd d12, d0 , d28 @@ -378,7 +378,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d11, [r4 , #24 ] fmacd d15, d0 , d31 - fstmiad CO2, { d12 - d15 } + vstmia.f64 CO2, { d12 - d15 } add CO1, CO1, #32 diff --git a/kernel/arm/dgemm_tcopy_4_vfp.S b/kernel/arm/dgemm_tcopy_4_vfp.S index 937f43957..8335de27c 100644 --- a/kernel/arm/dgemm_tcopy_4_vfp.S +++ b/kernel/arm/dgemm_tcopy_4_vfp.S @@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d8 - d11 } + vldmia.f64 r3, { d8 - d11 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d12 - d15 } + vldmia.f64 r3, { d12 - d15 } - fstmiad BO1, { d0 - d15 } + vstmia.f64 BO1, { d0 - d15 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } add r3, r3, LDA - fldmiad r3, { d4 - d5 } + vldmia.f64 r3, { d4 - d5 } add r3, r3, LDA - fldmiad r3, { d6 - d7 } + vldmia.f64 r3, { d6 - d7 } - fstmiad BO2, { d0 - d7 } + vstmia.f64 BO2, { d0 - d7 } add AO1, AO1, #16 add BO2, BO2, #64 @@ -117,18 +117,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY1x4 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } add r3, AO1, LDA - fldmiad r3, { d1 } + vldmia.f64 r3, { d1 } add r3, r3, LDA - fldmiad r3, { d2 } + vldmia.f64 r3, { d2 } add r3, r3, LDA - fldmiad r3, { d3 } + vldmia.f64 r3, { d3 } - fstmiad BO3, { d0 - d3 } + vstmia.f64 BO3, { d0 - d3 } add AO1, AO1, #8 add BO3, BO3, #32 @@ -139,13 +139,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } - fstmiad BO1, { d0 - d7 } + vstmia.f64 BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -153,12 +153,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } - fstmiad BO2, { d0 - d3 } + vstmia.f64 BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 @@ -166,12 +166,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } add r3, AO1, LDA - fldmiad r3, { d1 } + vldmia.f64 r3, { d1 } - fstmiad BO3, { d0 - d1 } + vstmia.f64 BO3, { d0 - d1 } add AO1, AO1, #8 add BO3, BO3, #16 @@ -182,9 +182,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x1 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } - fstmiad BO1, { d0 - d3 } + vstmia.f64 BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -192,9 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x1 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } - fstmiad BO2, { d0 - d1 } + vstmia.f64 BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 @@ -202,9 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } - fstmiad BO3, { d0 } + vstmia.f64 BO3, { d0 } add AO1, AO1, #8 add BO3, BO3, #8 diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c0c6a1677..e73936cdd 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -128,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + vldmia.f64 AO!, { d0 - d1} fmuld d16 , d0, d8 - fldmiad AO!, { d2 - d3} + vldmia.f64 AO!, { d2 - d3} fmuld d17 , d1, d8 fldd d9 , [ BO, #8 ] fmuld d18 , d2, d8 @@ -148,10 +148,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmuld d23 , d3, d9 fmuld d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmuld d25 , d1, d10 fmuld d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -173,10 +173,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + vldmia.f64 AO!, { d0 - d1} fmacd d16 , d0, d8 - fldmiad AO!, { d2 - d3} + vldmia.f64 AO!, { d2 - d3} fmacd d17 , d1, d8 fldd d9 , [ BO, #8 ] fmacd d18 , d2, d8 @@ -193,10 +193,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmacd d23 , d3, d9 fmacd d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmacd d25 , d1, d10 fmacd d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmacd d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -225,11 +225,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] fmacd d21 , d5, d13 fmacd d22 , d6, d13 - fldmiad AO!, { d0 - d1 } + vldmia.f64 AO!, { d0 - d1 } fmacd d23 , d7, d13 fmacd d24 , d4, d14 - fldmiad AO!, { d2 - d3 } + vldmia.f64 AO!, { d2 - d3 } fmacd d25 , d5, d14 fldd d9 , [ BO, #8 ] fmacd d26 , d6, d14 @@ -257,10 +257,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d19 , d3, d8 fmacd d20 , d0, d9 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmacd d21 , d1, d9 fmacd d22 , d2, d9 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmacd d23 , d3, d9 fmacd d24 , d0, d10 @@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d11, [r4 , #24 ] fmuld d15, d0 , d31 - fstmiad CO2, { d12 - d15 } + vstmia.f64 CO2, { d12 - d15 } add CO1, CO1, #32 diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 7c154d741..753ac27c6 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -139,8 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2 , #A_PRE ] - fldmiad XO! , { d2 } - fldmiad AO1 , { d4 - d7 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1 , { d4 - d7 } vmla.f64 d8 , d2 , d4 pld [ AO2 , #4*SIZE ] @@ -150,7 +150,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f64 d11 , d2 , d7 - fldmiad r3 , { d4 - d7 } + vldmia.f64 r3 , { d4 - d7 } vmla.f64 d12 , d2 , d4 vmla.f64 d13 , d2 , d5 @@ -164,23 +164,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } vmla.f64 d4 , d0, d8 vmla.f64 d5 , d0, d9 vmla.f64 d6 , d0, d10 vmla.f64 d7 , d0, d11 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } vmla.f64 d4 , d0, d12 vmla.f64 d5 , d0, d13 vmla.f64 d6 , d0, d14 vmla.f64 d7 , d0, d15 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -195,8 +195,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 } - fldmiad AO1 , { d8 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1 , { d8 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA @@ -204,9 +204,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d12 - fstmiad YO!, { d4 } + vstmia.f64 YO!, { d4 } .endm @@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X1 pld [ AO2 , #A_PRE ] - fldmiad XO , { d2 } - fldmiad AO1 , { d8 - d11 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1 , { d8 - d11 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA @@ -249,24 +249,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S4 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4 , d0, d12 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5 , d0, d13 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4 , d0, d14 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5 , d0, d15 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y .endm @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 } - fldmiad AO1 , { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1 , { d8 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA add XO, XO , INC_X @@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d12 - fstmiad YO , { d4 } + vstmia.f64 YO , { d4 } add YO, YO, INC_Y .endm @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2, #A_PRE ] - fldmias XO! , { s2 } - fldmias AO1 , { s4 - s7 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1 , { s4 - s7 } vmla.f32 s8 , s2 , s4 vmla.f32 s9 , s2 , s5 @@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add r3, AO1, #4*SIZE - fldmias r3 , { s4 - s7 } + vldmia.f32 r3 , { s4 - s7 } vmla.f32 s12 , s2 , s4 vmla.f32 s13 , s2 , s5 @@ -362,24 +362,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } vmla.f32 s4 , s0, s8 vmla.f32 s5 , s0, s9 vmla.f32 s6 , s0, s10 vmla.f32 s7 , s0, s11 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } vmla.f32 s4 , s0, s12 vmla.f32 s5 , s0, s13 vmla.f32 s6 , s0, s14 vmla.f32 s7 , s0, s15 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -394,8 +394,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 } - fldmias AO1 , { s8 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1 , { s8 } vmla.f32 s12 , s2 , s8 add AO1, AO1, LDA @@ -403,9 +403,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s12 - fstmias YO!, { s4 } + vstmia.f32 YO!, { s4 } .endm @@ -434,8 +434,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X1 - fldmias XO , { s2 } - fldmias AO1 , { s8 - s11 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1 , { s8 - s11 } vmla.f32 s12 , s2 , s8 vmla.f32 s13 , s2 , s9 @@ -449,24 +449,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S4 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4 , s0, s12 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5 , s0, s13 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4 , s0, s14 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5 , s0, s15 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y .endm @@ -482,8 +482,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmias XO , { s2 } - fldmias AO1 , { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1 , { s8 } vmla.f32 s12 , s2 , s8 add AO1, AO1, LDA add XO, XO , INC_X @@ -492,9 +492,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s12 - fstmias YO , { s4 } + vstmia.f32 YO , { s4 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 54f958b7b..e80dc1458 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -138,8 +138,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 - fldmiad XO! , { d4 } - fldmiad AO1 , { d8 - d15 } + vldmia.f64 XO! , { d4 } + vldmia.f64 AO1 , { d8 - d15 } vmla.f64 d24 , d4 , d8 pld [ AO2 , #A_PRE ] @@ -158,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmiad YO, { d16 - d23 } + vldmia.f64 YO, { d16 - d23 } vmla.f64 d16, d0, d24 vmla.f64 d17, d0, d25 @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f64 d22, d0, d30 vmla.f64 d23, d0, d31 - fstmiad YO!, { d16 - d23 } + vstmia.f64 YO!, { d16 - d23 } .endm @@ -184,8 +184,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d4 } - fldmiad AO1 , { d8 } + vldmia.f64 XO! , { d4 } + vldmia.f64 AO1 , { d8 } vmla.f64 d24 , d4 , d8 add AO1, AO1, LDA @@ -193,9 +193,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO!, { d16 } + vstmia.f64 YO!, { d16 } .endm @@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO2 , #A_PRE ] pld [ AO2 , #A_PRE+32 ] - fldmiad XO , { d4 } - fldmiad AO1 , { d8 - d15 } + vldmia.f64 XO , { d4 } + vldmia.f64 AO1 , { d8 - d15 } vmla.f64 d24 , d4 , d8 vmla.f64 d25 , d4 , d9 @@ -253,44 +253,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S8 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO, { d16 } + vstmia.f64 YO, { d16 } add YO, YO, INC_Y - fldmiad YO, { d17 } + vldmia.f64 YO, { d17 } vmla.f64 d17, d0, d25 - fstmiad YO, { d17 } + vstmia.f64 YO, { d17 } add YO, YO, INC_Y - fldmiad YO, { d18 } + vldmia.f64 YO, { d18 } vmla.f64 d18, d0, d26 - fstmiad YO, { d18 } + vstmia.f64 YO, { d18 } add YO, YO, INC_Y - fldmiad YO, { d19 } + vldmia.f64 YO, { d19 } vmla.f64 d19, d0, d27 - fstmiad YO, { d19 } + vstmia.f64 YO, { d19 } add YO, YO, INC_Y - fldmiad YO, { d20 } + vldmia.f64 YO, { d20 } vmla.f64 d20, d0, d28 - fstmiad YO, { d20 } + vstmia.f64 YO, { d20 } add YO, YO, INC_Y - fldmiad YO, { d21 } + vldmia.f64 YO, { d21 } vmla.f64 d21, d0, d29 - fstmiad YO, { d21 } + vstmia.f64 YO, { d21 } add YO, YO, INC_Y - fldmiad YO, { d22 } + vldmia.f64 YO, { d22 } vmla.f64 d22, d0, d30 - fstmiad YO, { d22 } + vstmia.f64 YO, { d22 } add YO, YO, INC_Y - fldmiad YO, { d23 } + vldmia.f64 YO, { d23 } vmla.f64 d23, d0, d31 - fstmiad YO, { d23 } + vstmia.f64 YO, { d23 } add YO, YO, INC_Y .endm @@ -306,8 +306,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmiad XO , { d4 } - fldmiad AO1 , { d8 } + vldmia.f64 XO , { d4 } + vldmia.f64 AO1 , { d8 } vmla.f64 d24 , d4 , d8 add AO1, AO1, LDA add XO, XO, INC_X @@ -316,9 +316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO, { d16 } + vstmia.f64 YO, { d16 } add YO, YO, INC_Y .endm @@ -361,8 +361,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2 , #A_PRE ] - fldmias XO! , { s4 } - fldmias AO1 , { s8 - s15 } + vldmia.f32 XO! , { s4 } + vldmia.f32 AO1 , { s8 - s15 } vmla.f32 s24 , s4 , s8 vmla.f32 s25 , s4 , s9 @@ -379,7 +379,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmias YO, { s16 - s23 } + vldmia.f32 YO, { s16 - s23 } vmla.f32 s16, s0, s24 vmla.f32 s17, s0, s25 @@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f32 s22, s0, s30 vmla.f32 s23, s0, s31 - fstmias YO!, { s16 - s23 } + vstmia.f32 YO!, { s16 - s23 } .endm @@ -405,8 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s4 } - fldmias AO1 , { s8 } + vldmia.f32 XO! , { s4 } + vldmia.f32 AO1 , { s8 } vmla.f32 s24 , s4 , s8 add AO1, AO1, LDA @@ -414,9 +414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO!, { s16 } + vstmia.f32 YO!, { s16 } .endm @@ -454,8 +454,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S8X1 pld [ AO2 , #A_PRE ] - fldmias XO , { s4 } - fldmias AO1 , { s8 - s15 } + vldmia.f32 XO , { s4 } + vldmia.f32 AO1 , { s8 - s15 } vmla.f32 s24 , s4 , s8 vmla.f32 s25 , s4 , s9 @@ -473,44 +473,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S8 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO, { s16 } + vstmia.f32 YO, { s16 } add YO, YO, INC_Y - fldmias YO, { s17 } + vldmia.f32 YO, { s17 } vmla.f32 s17, s0, s25 - fstmias YO, { s17 } + vstmia.f32 YO, { s17 } add YO, YO, INC_Y - fldmias YO, { s18 } + vldmia.f32 YO, { s18 } vmla.f32 s18, s0, s26 - fstmias YO, { s18 } + vstmia.f32 YO, { s18 } add YO, YO, INC_Y - fldmias YO, { s19 } + vldmia.f32 YO, { s19 } vmla.f32 s19, s0, s27 - fstmias YO, { s19 } + vstmia.f32 YO, { s19 } add YO, YO, INC_Y - fldmias YO, { s20 } + vldmia.f32 YO, { s20 } vmla.f32 s20, s0, s28 - fstmias YO, { s20 } + vstmia.f32 YO, { s20 } add YO, YO, INC_Y - fldmias YO, { s21 } + vldmia.f32 YO, { s21 } vmla.f32 s21, s0, s29 - fstmias YO, { s21 } + vstmia.f32 YO, { s21 } add YO, YO, INC_Y - fldmias YO, { s22 } + vldmia.f32 YO, { s22 } vmla.f32 s22, s0, s30 - fstmias YO, { s22 } + vstmia.f32 YO, { s22 } add YO, YO, INC_Y - fldmias YO, { s23 } + vldmia.f32 YO, { s23 } vmla.f32 s23, s0, s31 - fstmias YO, { s23 } + vstmia.f32 YO, { s23 } add YO, YO, INC_Y .endm @@ -526,8 +526,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s4 } - fldmias AO1 , { s8 } + vldmia.f32 XO , { s4 } + vldmia.f32 AO1 , { s8 } vmla.f32 s24 , s4 , s8 add AO1, AO1, LDA add XO, XO, INC_X @@ -536,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S1 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO, { s16 } + vstmia.f32 YO, { s16 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 9559d1829..fbe51cc8c 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -112,13 +112,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d12 - d15 } + vldmia.f64 XO! , { d12 - d15 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d4 - d5 } - fldmiad AO1!, { d10 - d11 } - fldmiad AO2!, { d6 - d7 } + vldmia.f64 AO2!, { d4 - d5 } + vldmia.f64 AO1!, { d10 - d11 } + vldmia.f64 AO2!, { d6 - d7 } vmla.f64 d2 , d12 , d8 vmla.f64 d3 , d12 , d4 @@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d1 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d4 } + vldmia.f64 XO! , { d1 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d4 } vmla.f64 d2 , d1 , d8 vmla.f64 d3 , d1 , d4 @@ -143,10 +143,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } vmla.f64 d4, d0, d2 vmla.f64 d5, d0, d3 - fstmiad YO!, { d4 - d5 } + vstmia.f64 YO!, { d4 - d5 } .endm @@ -160,10 +160,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d12 - d15 } + vldmia.f64 XO! , { d12 - d15 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d2 , d12 , d8 vmla.f64 d2 , d13 , d9 vmla.f64 d2 , d14, d10 @@ -173,17 +173,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d1 } - fldmiad AO1!, { d8 } + vldmia.f64 XO! , { d1 } + vldmia.f64 AO1!, { d8 } vmla.f64 d2 , d1 , d8 .endm .macro SAVE_F1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO!, { d4 } + vstmia.f64 YO!, { d4 } .endm @@ -197,23 +197,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 - fldmiad XO , { d12 } + vldmia.f64 XO , { d12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d4 - d5 } + vldmia.f64 AO2!, { d4 - d5 } - fldmiad XO , { d13 } + vldmia.f64 XO , { d13 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } - fldmiad AO2!, { d6 - d7 } + vldmia.f64 AO1!, { d10 - d11 } + vldmia.f64 AO2!, { d6 - d7 } - fldmiad XO , { d14 } + vldmia.f64 XO , { d14 } add XO, XO, INC_X - fldmiad XO , { d15 } + vldmia.f64 XO , { d15 } add XO, XO, INC_X vmla.f64 d2 , d12 , d8 @@ -229,9 +229,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d1 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d4 } + vldmia.f64 XO , { d1 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d4 } vmla.f64 d2 , d1 , d8 add XO, XO, INC_X vmla.f64 d3 , d1 , d4 @@ -240,14 +240,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S2 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5, d0, d3 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y .endm @@ -261,20 +261,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmiad XO , { d12 } + vldmia.f64 XO , { d12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } - fldmiad XO , { d13 } + vldmia.f64 XO , { d13 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } - fldmiad XO , { d14 } + vldmia.f64 XO , { d14 } add XO, XO, INC_X - fldmiad XO , { d15 } + vldmia.f64 XO , { d15 } add XO, XO, INC_X vmla.f64 d2 , d12 , d8 @@ -286,8 +286,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d1 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d1 } + vldmia.f64 AO1!, { d8 } vmla.f64 d2 , d1 , d8 add XO, XO, INC_X @@ -295,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y .endm @@ -315,11 +315,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 - fldmias XO! , { s12 - s15 } - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s4 - s5 } - fldmias AO1!, { s10 - s11 } - fldmias AO2!, { s6 - s7 } + vldmia.f32 XO! , { s12 - s15 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s4 - s5 } + vldmia.f32 AO1!, { s10 - s11 } + vldmia.f32 AO2!, { s6 - s7 } vmla.f32 s2 , s12 , s8 vmla.f32 s3 , s12 , s4 @@ -334,9 +334,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s1 } - fldmias AO1!, { s8 } - fldmias AO2!, { s4 } + vldmia.f32 XO! , { s1 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s4 } vmla.f32 s2 , s1 , s8 vmla.f32 s3 , s1 , s4 @@ -344,10 +344,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } vmla.f32 s4, s0, s2 vmla.f32 s5, s0, s3 - fstmias YO!, { s4 - s5 } + vstmia.f32 YO!, { s4 - s5 } .endm @@ -359,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 - fldmias XO! , { s12 - s15 } - fldmias AO1!, { s8 - s9 } - fldmias AO1!, { s10 - s11 } + vldmia.f32 XO! , { s12 - s15 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s2 , s12 , s8 vmla.f32 s2 , s13 , s9 vmla.f32 s2 , s14, s10 @@ -371,17 +371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s1 } - fldmias AO1!, { s8 } + vldmia.f32 XO! , { s1 } + vldmia.f32 AO1!, { s8 } vmla.f32 s2 , s1 , s8 .endm .macro SAVE_F1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO!, { s4 } + vstmia.f32 YO!, { s4 } .endm @@ -395,21 +395,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S2X4 - fldmias XO , { s12 } + vldmia.f32 XO , { s12 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s4 - s5 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s4 - s5 } - fldmias XO , { s13 } + vldmia.f32 XO , { s13 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } - fldmias AO2!, { s6 - s7 } + vldmia.f32 AO1!, { s10 - s11 } + vldmia.f32 AO2!, { s6 - s7 } - fldmias XO , { s14 } + vldmia.f32 XO , { s14 } add XO, XO, INC_X - fldmias XO , { s15 } + vldmia.f32 XO , { s15 } add XO, XO, INC_X vmla.f32 s2 , s12 , s8 @@ -425,9 +425,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmias XO , { s1 } - fldmias AO1!, { s8 } - fldmias AO2!, { s4 } + vldmia.f32 XO , { s1 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s4 } vmla.f32 s2 , s1 , s8 add XO, XO, INC_X vmla.f32 s3 , s1 , s4 @@ -436,14 +436,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5, s0, s3 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y .endm @@ -456,20 +456,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmias XO , { s12 } + vldmia.f32 XO , { s12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmias AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s8 - s9 } - fldmias XO , { s13 } + vldmia.f32 XO , { s13 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } - fldmias XO , { s14 } + vldmia.f32 XO , { s14 } add XO, XO, INC_X - fldmias XO , { s15 } + vldmia.f32 XO , { s15 } add XO, XO, INC_X vmla.f32 s2 , s12 , s8 @@ -481,8 +481,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s1 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s1 } + vldmia.f32 AO1!, { s8 } vmla.f32 s2 , s1 , s8 add XO, XO, INC_X @@ -490,9 +490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index b1d3dadf1..a88d70016 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -108,17 +108,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d28 - d31 } + vldmia.f64 XO! , { d28 - d31 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d16 - d17 } + vldmia.f64 AO2!, { d16 - d17 } vmla.f64 d4 , d28 , d8 vmla.f64 d5 , d28 , d16 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 vmla.f64 d5 , d29 , d17 - fldmiad AO2!, { d18 - d19 } + vldmia.f64 AO2!, { d18 - d19 } vmla.f64 d4 , d30, d10 vmla.f64 d5 , d30, d18 vmla.f64 d4 , d31, d11 @@ -129,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d2 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d16 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d16 } vmla.f64 d4 , d2 , d8 vmla.f64 d5 , d2 , d16 @@ -139,10 +139,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_F2 - fldmiad YO, { d24 - d25 } + vldmia.f64 YO, { d24 - d25 } vmla.f64 d24, d0, d4 vmla.f64 d25, d0, d5 - fstmiad YO!, { d24 - d25 } + vstmia.f64 YO!, { d24 - d25 } .endm @@ -156,23 +156,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 pld [ AO1 , #A_PRE ] - fldmiad XO , { d28 } + vldmia.f64 XO , { d28 } add XO, XO, INC_X - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d16 - d17 } + vldmia.f64 AO2!, { d16 - d17 } vmla.f64 d4 , d28 , d8 - fldmiad XO , { d29 } + vldmia.f64 XO , { d29 } add XO, XO, INC_X vmla.f64 d5 , d28 , d16 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 - fldmiad XO , { d30 } + vldmia.f64 XO , { d30 } add XO, XO, INC_X vmla.f64 d5 , d29 , d17 - fldmiad AO2!, { d18 - d19 } + vldmia.f64 AO2!, { d18 - d19 } vmla.f64 d4 , d30, d10 - fldmiad XO , { d31 } + vldmia.f64 XO , { d31 } add XO, XO, INC_X vmla.f64 d5 , d30, d18 vmla.f64 d4 , d31, d11 @@ -183,10 +183,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1!, { d8 } add XO, XO, INC_X - fldmiad AO2!, { d16 } + vldmia.f64 AO2!, { d16 } vmla.f64 d4 , d2 , d8 vmla.f64 d5 , d2 , d16 @@ -194,14 +194,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d5 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y .endm @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d28 - d31 } + vldmia.f64 XO! , { d28 - d31 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } vmla.f64 d4 , d28 , d8 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 vmla.f64 d4 , d30, d10 vmla.f64 d4 , d31, d11 @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1!, { d8 } vmla.f64 d4 , d2 , d8 .endm .macro SAVE_F1 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO!, { d24 } + vstmia.f64 YO!, { d24 } .endm @@ -252,18 +252,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 pld [ AO1 , #A_PRE ] - fldmiad XO , { d28 } + vldmia.f64 XO , { d28 } add XO, XO, INC_X - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } vmla.f64 d4 , d28 , d8 - fldmiad XO , { d29 } + vldmia.f64 XO , { d29 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 - fldmiad XO , { d30 } + vldmia.f64 XO , { d30 } add XO, XO, INC_X vmla.f64 d4 , d30, d10 - fldmiad XO , { d31 } + vldmia.f64 XO , { d31 } add XO, XO, INC_X vmla.f64 d4 , d31, d11 @@ -272,8 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1!, { d8 } add XO, XO, INC_X vmla.f64 d4 , d2 , d8 @@ -281,9 +281,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 .macro SAVE_S1
-	fldmiad	YO, { d24 }
+	vldmia.f64	YO, { d24 }
 	vmla.f64	d24, d0, d4
-	fstmiad	YO, { d24 }
+	vstmia.f64	YO, { d24 }
 	add	YO, YO, INC_Y
 .endm
@@ -300,15 +300,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F2X4
-	fldmias	XO! , { s28 - s31 }
-	fldmias	AO1!, { s8 - s9 }
-	fldmias	AO2!, { s16 - s17 }
+	vldmia.f32	XO! , { s28 - s31 }
+	vldmia.f32	AO1!, { s8 - s9 }
+	vldmia.f32	AO2!, { s16 - s17 }
 	vmla.f32	s4 , s28 , s8
 	vmla.f32	s5 , s28 , s16
-	fldmias	AO1!, { s10 - s11 }
+	vldmia.f32	AO1!, { s10 - s11 }
 	vmla.f32	s4 , s29 , s9
 	vmla.f32	s5 , s29 , s17
-	fldmias	AO2!, { s18 - s19 }
+	vldmia.f32	AO2!, { s18 - s19 }
 	vmla.f32	s4 , s30, s10
 	vmla.f32	s5 , s30, s18
 	vmla.f32	s4 , s31, s11
@@ -319,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F2X1
-	fldmias	XO! , { s2 }
-	fldmias	AO1!, { s8 }
-	fldmias	AO2!, { s16 }
+	vldmia.f32	XO! , { s2 }
+	vldmia.f32	AO1!, { s8 }
+	vldmia.f32	AO2!, { s16 }
 	vmla.f32	s4 , s2 , s8
 	vmla.f32	s5 , s2 , s16
@@ -329,10 +329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE_F2
-	fldmias	YO, { s24 - s25 }
+	vldmia.f32	YO, { s24 - s25 }
 	vmla.f32	s24, s0, s4
 	vmla.f32	s25, s0, s5
-	fstmias	YO!, { s24 - s25 }
+	vstmia.f32	YO!, { s24 - s25 }
 .endm
@@ -345,22 +345,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_S2X4
-	fldmias	XO , { s28 }
+	vldmia.f32	XO , { s28 }
 	add	XO, XO, INC_X
-	fldmias	AO1!, { s8 - s9 }
-	fldmias	AO2!, { s16 - s17 }
+	vldmia.f32	AO1!, { s8 - s9 }
+	vldmia.f32	AO2!, { s16 - s17 }
 	vmla.f32	s4 , s28 , s8
-	fldmias	XO , { s29 }
+	vldmia.f32	XO , { s29 }
 	add	XO, XO, INC_X
 	vmla.f32	s5 , s28 , s16
-	fldmias	AO1!, { s10 - s11 }
+	vldmia.f32	AO1!, { s10 - s11 }
 	vmla.f32	s4 , s29 , s9
-	fldmias	XO , { s30 }
+	vldmia.f32	XO , { s30 }
 	add	XO, XO, INC_X
 	vmla.f32	s5 , s29 , s17
-	fldmias	AO2!, { s18 - s19 }
+	vldmia.f32	AO2!, { s18 - s19 }
 	vmla.f32	s4 , s30, s10
-	fldmias	XO , { s31 }
+	vldmia.f32	XO , { s31 }
 	add	XO, XO, INC_X
 	vmla.f32	s5 , s30, s18
 	vmla.f32	s4 , s31, s11
@@ -371,10 +371,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_S2X1
-	fldmias	XO , { s2 }
-	fldmias	AO1!, { s8 }
+	vldmia.f32	XO , { s2 }
+	vldmia.f32	AO1!, { s8 }
 	add	XO, XO, INC_X
-	fldmias	AO2!, { s16 }
+	vldmia.f32	AO2!, { s16 }
 	vmla.f32	s4 , s2 , s8
 	vmla.f32	s5 , s2 , s16
@@ -382,14 +382,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE_S2
-	fldmias	YO, { s24 }
+	vldmia.f32	YO, { s24 }
 	vmla.f32	s24, s0, s4
-	fstmias	YO, { s24 }
+	vstmia.f32	YO, { s24 }
 	add	YO, YO, INC_Y
-	fldmias	YO, { s24 }
+	vldmia.f32	YO, { s24 }
 	vmla.f32	s24, s0, s5
-	fstmias	YO, { s24 }
+	vstmia.f32	YO, { s24 }
 	add	YO, YO, INC_Y
 .endm
@@ -402,10 +402,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F1X4
-	fldmias	XO! , { s28 - s31 }
-	fldmias	AO1!, { s8 - s9 }
+	vldmia.f32	XO! , { s28 - s31 }
+	vldmia.f32	AO1!, { s8 - s9 }
 	vmla.f32	s4 , s28 , s8
-	fldmias	AO1!, { s10 - s11 }
+	vldmia.f32	AO1!, { s10 - s11 }
 	vmla.f32	s4 , s29 , s9
 	vmla.f32	s4 , s30, s10
 	vmla.f32	s4 , s31, s11
@@ -415,17 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F1X1
-	fldmias	XO! , { s2 }
-	fldmias	AO1!, { s8 }
+	vldmia.f32	XO! , { s2 }
+	vldmia.f32	AO1!, { s8 }
 	vmla.f32	s4 , s2 , s8
 .endm
 .macro SAVE_F1
-	fldmias	YO, { s24 }
+	vldmia.f32	YO, { s24 }
 	vmla.f32	s24, s0, s4
-	fstmias	YO!, { s24 }
+	vstmia.f32	YO!, { s24 }
 .endm
@@ -437,18 +437,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_S1X4
-	fldmias	XO , { s28 }
+	vldmia.f32	XO , { s28 }
 	add	XO, XO, INC_X
-	fldmias	AO1!, { s8 - s9 }
+	vldmia.f32	AO1!, { s8 - s9 }
 	vmla.f32	s4 , s28 , s8
-	fldmias	XO , { s29 }
+	vldmia.f32	XO , { s29 }
 	add	XO, XO, INC_X
-	fldmias	AO1!, { s10 - s11 }
+	vldmia.f32	AO1!, { s10 - s11 }
 	vmla.f32	s4 , s29 , s9
-	fldmias	XO , { s30 }
+	vldmia.f32	XO , { s30 }
 	add	XO, XO, INC_X
 	vmla.f32	s4 , s30, s10
-	fldmias	XO , { s31 }
+	vldmia.f32	XO , { s31 }
 	add	XO, XO, INC_X
 	vmla.f32	s4 , s31, s11
@@ -457,8 +457,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_S1X1
-	fldmias	XO , { s2 }
-	fldmias	AO1!, { s8 }
+	vldmia.f32	XO , { s2 }
+	vldmia.f32	AO1!, { s8 }
 	add	XO, XO, INC_X
 	vmla.f32	s4 , s2 , s8
@@ -466,9 +466,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE_S1
-	fldmias	YO, { s24 }
+	vldmia.f32	YO, { s24 }
 	vmla.f32	s24, s0, s4
-	fstmias	YO, { s24 }
+	vstmia.f32	YO, { s24 }
 	add	YO, YO, INC_Y
 .endm
diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S
index fab05c9c8..fd43b15b1 100644
--- a/kernel/arm/iamax_vfp.S
+++ b/kernel/arm/iamax_vfp.S
@@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro INIT_F
-	fldmiad	X!, { d0 }
+	vldmia.f64	X!, { d0 }
 	VABS( d0, d0 )
 	mov	Z, #1
 	mov	INDEX, Z
@@ -123,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F1
-	fldmiad	X!, { d4 }
+	vldmia.f64	X!, { d4 }
 	add	Z, Z, #1
 	VABS( d4, d4 )
 	vcmpe.f64	d4, d0
@@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro INIT_S
-	fldmiad	X, { d0 }
+	vldmia.f64	X, { d0 }
 	VABS( d0, d0 )
 	mov	Z, #1
 	mov	INDEX, Z
@@ -146,7 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_S1
-	fldmiad	X, { d4 }
+	vldmia.f64	X, { d4 }
 	add	Z, Z, #1
 	VABS( d4, d4 )
 	vcmpe.f64	d4, d0
@@ -161,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro INIT_F
-	fldmias	X!, { s0 }
+	vldmia.f32	X!, { s0 }
 	VABS( s0, s0 )
 	mov	Z, #1
 	mov	INDEX, Z
@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F1
-	fldmias	X!, { s4 }
+	vldmia.f32	X!, { s4 }
 	add	Z, Z, #1
 	VABS( s4, s4 )
 	vcmpe.f32	s4, s0
@@ -182,7 +182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro INIT_S
-	fldmias	X, { s0 }
+	vldmia.f32	X, { s0 }
 	VABS( s0, s0 )
 	mov	Z, #1
 	mov	INDEX, Z
@@ -193,7 +193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_S1
-	fldmias	X, { s4 }
+	vldmia.f32	X, { s4 }
 	add	Z, Z, #1
 	VABS( s4, s4 )
 	vcmpe.f32	s4, s0
@@ -215,7 +215,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro INIT_F
-	fldmiad	X!, { d0 -d1 }
+	vldmia.f64	X!, { d0 -d1 }
 	vabs.f64	d0, d0
 	vabs.f64	d1, d1
 	vadd.f64	d0 , d0, d1
@@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F1
-	fldmiad	X!, { d4 - d5 }
+	vldmia.f64	X!, { d4 - d5 }
 	add	Z, Z, #1
 	vabs.f64	d4, d4
 	vabs.f64	d5, d5
@@ -241,7 +241,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S - fldmiad X, { d0 -d1 } + vldmia.f64 X, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 @@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 @@ -272,7 +272,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmias X!, { s0 -s1 } + vldmia.f32 X!, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 @@ -284,7 +284,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 @@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmias X, { s0 -s1 } + vldmia.f32 X, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 @@ -312,7 +312,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index 16ac5a632..8e0937851 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -121,7 +121,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -191,7 +191,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -249,7 +249,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -294,7 +294,7 @@ KERNEL_S1_END_\@: .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -350,7 +350,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 84977901d..7be1e977e 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -121,7 +121,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -191,7 +191,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -249,7 +249,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -294,7 +294,7 @@ KERNEL_S1_END_\@: .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -350,7 +350,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index ea296dbc5..6aec06205 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -77,68 +77,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X, { d2 } - fstmiad Y, { d3 } + vstmia.f64 X, { d2 } + vstmia.f64 Y, { d3 } add X, X, INC_X add Y, Y, INC_Y @@ -149,68 +149,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 .macro KERNEL_F4
-	fldmias	X, { s4 }
-	fldmias	Y, { s5 }
+	vldmia.f32	X, { s4 }
+	vldmia.f32	Y, { s5 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s5
 	vmul.f32	s3 , s0, s5
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
-	fldmias	X, { s4 }
-	fldmias	Y, { s5 }
+	vldmia.f32	X, { s4 }
+	vldmia.f32	Y, { s5 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s5
 	vmul.f32	s3 , s0, s5
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
-	fldmias	X, { s4 }
-	fldmias	Y, { s5 }
+	vldmia.f32	X, { s4 }
+	vldmia.f32	Y, { s5 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s5
 	vmul.f32	s3 , s0, s5
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
-	fldmias	X, { s4 }
-	fldmias	Y, { s5 }
+	vldmia.f32	X, { s4 }
+	vldmia.f32	Y, { s5 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s5
 	vmul.f32	s3 , s0, s5
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 .endm
 .macro KERNEL_F1
-	fldmias	X, { s4 }
-	fldmias	Y, { s5 }
+	vldmia.f32	X, { s4 }
+	vldmia.f32	Y, { s5 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s5
 	vmul.f32	s3 , s0, s5
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 .endm
 .macro KERNEL_S1
-	fldmias	X, { s4 }
-	fldmias	Y, { s5 }
+	vldmia.f32	X, { s4 }
+	vldmia.f32	Y, { s5 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s5
 	vmul.f32	s3 , s0, s5
 	vmls.f32	s3 , s1, s4
-	fstmias	X, { s2 }
-	fstmias	Y, { s3 }
+	vstmia.f32	X, { s2 }
+	vstmia.f32	Y, { s3 }
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
@@ -230,96 +230,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
-	fldmiad	X, { d4 - d5 }
-	fldmiad	Y, { d6 - d7 }
+	vldmia.f64	X, { d4 - d5 }
+	vldmia.f64	Y, { d6 - d7 }
 	vmul.f64	d2 , d0, d4
 	fmacd	d2 , d1, d6
 	vmul.f64	d3 , d0, d6
 	vmls.f64	d3 , d1, d4
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
 	vmul.f64	d2 , d0, d5
 	fmacd	d2 , d1, d7
 	vmul.f64	d3 , d0, d7
 	vmls.f64	d3 , d1, d5
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
-	fldmiad	X, { d4 - d5 }
-	fldmiad	Y, { d6 - d7 }
+	vldmia.f64	X, { d4 - d5 }
+	vldmia.f64	Y, { d6 - d7 }
 	vmul.f64	d2 , d0, d4
 	fmacd	d2 , d1, d6
 	vmul.f64	d3 , d0, d6
 	vmls.f64	d3 , d1, d4
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
 	vmul.f64	d2 , d0, d5
 	fmacd	d2 , d1, d7
 	vmul.f64	d3 , d0, d7
 	vmls.f64	d3 , d1, d5
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
-	fldmiad	X, { d4 - d5 }
-	fldmiad	Y, { d6 - d7 }
+	vldmia.f64	X, { d4 - d5 }
+	vldmia.f64	Y, { d6 - d7 }
 	vmul.f64	d2 , d0, d4
 	fmacd	d2 , d1, d6
 	vmul.f64	d3 , d0, d6
 	vmls.f64	d3 , d1, d4
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
 	vmul.f64	d2 , d0, d5
 	fmacd	d2 , d1, d7
 	vmul.f64	d3 , d0, d7
 	vmls.f64	d3 , d1, d5
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
-	fldmiad	X, { d4 - d5 }
-	fldmiad	Y, { d6 - d7 }
+	vldmia.f64	X, { d4 - d5 }
+	vldmia.f64	Y, { d6 - d7 }
 	vmul.f64	d2 , d0, d4
 	fmacd	d2 , d1, d6
 	vmul.f64	d3 , d0, d6
 	vmls.f64	d3 , d1, d4
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
 	vmul.f64	d2 , d0, d5
 	fmacd	d2 , d1, d7
 	vmul.f64	d3 , d0, d7
 	vmls.f64	d3 , d1, d5
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
 .endm
 .macro KERNEL_F1
-	fldmiad	X, { d4 - d5 }
-	fldmiad	Y, { d6 - d7 }
+	vldmia.f64	X, { d4 - d5 }
+	vldmia.f64	Y, { d6 - d7 }
 	vmul.f64	d2 , d0, d4
 	fmacd	d2 , d1, d6
 	vmul.f64	d3 , d0, d6
 	vmls.f64	d3 , d1, d4
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
 	vmul.f64	d2 , d0, d5
 	fmacd	d2 , d1, d7
 	vmul.f64	d3 , d0, d7
 	vmls.f64	d3 , d1, d5
-	fstmiad	X!, { d2 }
-	fstmiad	Y!, { d3 }
+	vstmia.f64	X!, { d2 }
+	vstmia.f64	Y!, { d3 }
 .endm
 .macro KERNEL_S1
-	fldmiad	X, { d4 - d5 }
-	fldmiad	Y, { d6 - d7 }
+	vldmia.f64	X, { d4 - d5 }
+	vldmia.f64	Y, { d6 - d7 }
 	vmul.f64	d2 , d0, d4
 	fmacd	d2 , d1, d6
 	vmul.f64	d3 , d0, d6
@@ -347,96 +347,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
-	fldmias	X, { s4 - s5 }
-	fldmias	Y, { s6 - s7 }
+	vldmia.f32	X, { s4 - s5 }
+	vldmia.f32	Y, { s6 - s7 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s6
 	vmul.f32	s3 , s0, s6
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 	vmul.f32	s2 , s0, s5
 	fmacs	s2 , s1, s7
 	vmul.f32	s3 , s0, s7
 	vmls.f32	s3 , s1, s5
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
-	fldmias	X, { s4 - s5 }
-	fldmias	Y, { s6 - s7 }
+	vldmia.f32	X, { s4 - s5 }
+	vldmia.f32	Y, { s6 - s7 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s6
 	vmul.f32	s3 , s0, s6
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 	vmul.f32	s2 , s0, s5
 	fmacs	s2 , s1, s7
 	vmul.f32	s3 , s0, s7
 	vmls.f32	s3 , s1, s5
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 	pld	[ X, #X_PRE ]
 	pld	[ Y, #X_PRE ]
-	fldmias	X, { s4 - s5 }
-	fldmias	Y, { s6 - s7 }
+	vldmia.f32	X, { s4 - s5 }
+	vldmia.f32	Y, { s6 - s7 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s6
 	vmul.f32	s3 , s0, s6
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 	vmul.f32	s2 , s0, s5
 	fmacs	s2 , s1, s7
 	vmul.f32	s3 , s0, s7
 	vmls.f32	s3 , s1, s5
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
-	fldmias	X, { s4 - s5 }
-	fldmias	Y, { s6 - s7 }
+	vldmia.f32	X, { s4 - s5 }
+	vldmia.f32	Y, { s6 - s7 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s6
 	vmul.f32	s3 , s0, s6
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 	vmul.f32	s2 , s0, s5
 	fmacs	s2 , s1, s7
 	vmul.f32	s3 , s0, s7
 	vmls.f32	s3 , s1, s5
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 .endm
 .macro KERNEL_F1
-	fldmias	X, { s4 - s5 }
-	fldmias	Y, { s6 - s7 }
+	vldmia.f32	X, { s4 - s5 }
+	vldmia.f32	Y, { s6 - s7 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s6
 	vmul.f32	s3 , s0, s6
 	vmls.f32	s3 , s1, s4
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 	vmul.f32	s2 , s0, s5
 	fmacs	s2 , s1, s7
 	vmul.f32	s3 , s0, s7
 	vmls.f32	s3 , s1, s5
-	fstmias	X!, { s2 }
-	fstmias	Y!, { s3 }
+	vstmia.f32	X!, { s2 }
+	vstmia.f32	Y!, { s3 }
 .endm
 .macro KERNEL_S1
-	fldmias	X, { s4 - s5 }
-	fldmias	Y, { s6 - s7 }
+	vldmia.f32	X, { s4 - s5 }
+	vldmia.f32	Y, { s6 - s7 }
 	vmul.f32	s2 , s0, s4
 	fmacs	s2 , s1, s6
 	vmul.f32	s3 , s0, s6
diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S
index cc3e3b98d..8992c35a8 100644
--- a/kernel/arm/scal_vfp.S
+++ b/kernel/arm/scal_vfp.S
@@ -64,30 +64,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F4
 	pld	[ X, #X_PRE ]
-	fldmiad	X, { d4 - d7 }
+	vldmia.f64	X, { d4 - d7 }
 	vmul.f64	d4, d4, d0
 	vmul.f64	d5, d5, d0
 	vmul.f64	d6, d6, d0
-	fstmiad	X!, { d4 - d5 }
+	vstmia.f64	X!, { d4 - d5 }
 	vmul.f64	d7, d7, d0
-	fstmiad	X!, { d6 - d7 }
+	vstmia.f64	X!, { d6 - d7 }
 .endm
 .macro KERNEL_F1
-	fldmiad	X, { d4 }
+	vldmia.f64	X, { d4 }
 	vmul.f64	d4, d4, d0
-	fstmiad	X!, { d4 }
+	vstmia.f64	X!, { d4 }
 .endm
 .macro KERNEL_S1
-	fldmiad	X, { d4 }
+	vldmia.f64	X, { d4 }
 	vmul.f64	d4, d4, d0
-	fstmiad	X, { d4 }
+	vstmia.f64	X, { d4 }
 	add	X, X, INC_X
 .endm
@@ -96,30 +96,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL_F4
-	fldmias	X, { s4 - s7 }
+	vldmia.f32	X, { s4 - s7 }
 	vmul.f32	s4, s4, s0
 	vmul.f32	s5, s5, s0
 	vmul.f32	s6, s6, s0
-	fstmias	X!, { s4 - s5 }
+	vstmia.f32	X!, { s4 - s5 }
 	vmul.f32	s7, s7, s0
-	fstmias	X!, { s6 - s7 }
+	vstmia.f32	X!, { s6 - s7 }
 .endm
 .macro KERNEL_F1
-	fldmias	X, { s4 }
+	vldmia.f32	X, { s4 }
 	vmul.f32	s4, s4, s0
-	fstmias	X!, { s4 }
+	vstmia.f32	X!, { s4 }
 .endm
 .macro KERNEL_S1
-	fldmias	X, { s4 }
+	vldmia.f32	X, { s4 }
 	vmul.f32	s4, s4, s0
-	fstmias	X, { s4 }
+	vstmia.f32	X, { s4 }
 	add	X, X, INC_X
 .endm
@@ -136,58 +136,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	pld	[ X, #X_PRE ]
-	fldmiad	X, { d4 - d5 }
+	vldmia.f64	X, { d4 - d5 }
 	vmul.f64	d2, d0, d4
 	vmls.f64	d2, d1, d5
 	vmul.f64	d3, d0, d5
 	fmacd	d3, d1, d4
-	fstmiad	X!, { d2 - d3 }
+	vstmia.f64	X!, { d2 - d3 }
-	fldmiad	X, { d4 - d5 }
+	vldmia.f64	X, { d4 - d5 }
 	vmul.f64	d2, d0, d4
 	vmls.f64	d2, d1, d5
 	vmul.f64	d3, d0, d5
 	fmacd	d3, d1, d4
-	fstmiad	X!, { d2 - d3 }
+	vstmia.f64	X!, { d2 - d3 }
 	pld	[ X, #X_PRE ]
-	fldmiad	X, { d4 - d5 }
+	vldmia.f64	X, { d4 - d5 }
 	vmul.f64	d2, d0, d4
 	vmls.f64	d2, d1, d5
 	vmul.f64	d3, d0, d5
 	fmacd	d3, d1, d4
-	fstmiad	X!, { d2 - d3 }
+	vstmia.f64	X!, { d2 - d3 }
-	fldmiad	X, { d4 - d5 }
+	vldmia.f64	X, { d4 - d5 }
 	vmul.f64	d2, d0, d4
 	vmls.f64	d2, d1, d5
 	vmul.f64	d3, d0, d5
 	fmacd	d3, d1, d4
-	fstmiad	X!, { d2 - d3 }
+	vstmia.f64	X!, { d2 - d3 }
 .endm
 .macro KERNEL_F1
-	fldmiad	X, { d4 - d5 }
+	vldmia.f64	X, { d4 - d5 }
 	vmul.f64	d2, d0, d4
 	vmls.f64	d2, d1, d5
 	vmul.f64	d3, d0, d5
 	fmacd	d3, d1, d4
-	fstmiad	X!, { d2 - d3 }
+	vstmia.f64	X!, { d2 - d3 }
 .endm
 .macro KERNEL_S1
-	fldmiad	X, { d4 - d5 }
+	vldmia.f64	X, { d4 - d5 }
 	vmul.f64	d2, d0, d4
 	vmls.f64	d2, d1, d5
 	vmul.f64	d3, d0, d5
 	fmacd	d3, d1, d4
-	fstmiad	X, { d2 - d3 }
+	vstmia.f64	X, { d2 - d3 }
 	add	X, X, INC_X
 .endm
@@ -199,56 +199,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ X, #X_PRE ] - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X, { s2 - s3 } + vstmia.f32 X, { s2 - s3 } add X, X, INC_X .endm diff --git a/kernel/arm/scopy_vfp.S b/kernel/arm/scopy_vfp.S index 0fd815db8..1ccd29c95 100644 --- a/kernel/arm/scopy_vfp.S +++ b/kernel/arm/scopy_vfp.S @@ -65,17 +65,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F8 pld [ X, #X_PRE ] - fldmias X!, { s0 - s3 } - fldmias X!, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias Y!, { s4 - s7 } + vldmia.f32 X!, { s0 - s3 } + vldmia.f32 X!, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 Y!, { s4 - s7 } .endm .macro COPY_F1 - fldmias X!, { s0 } - fstmias Y!, { s0 } + vldmia.f32 X!, { s0 } + vstmia.f32 Y!, { s0 } .endm @@ -85,23 +85,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s1 } - fstmias Y, { s1 } + vldmia.f32 X, { s1 } + vstmia.f32 Y, { s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s1 } - fstmias Y, { s1 } + vldmia.f32 X, { s1 } + vstmia.f32 Y, { s1 } add X, X, INC_X add Y, Y, INC_Y @@ -110,8 +110,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index 544846258..bb374b5ee 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -68,26 +68,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -96,8 +96,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -109,32 +109,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -146,8 +146,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -162,12 +162,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s8 - s9 } - fldmias Y!, { s4 - s5} + vldmia.f32 X!, { s8 - s9 } + vldmia.f32 Y!, { s4 - s5} fmacs s0 , s4, s8 - fldmias X!, { s10 - s11 } + vldmia.f32 X!, { s10 - s11 } fmacs s1 , s5, s9 - fldmias Y!, { s6 - s7 } + vldmia.f32 Y!, { s6 - s7 } fmacs s0 , s6, s10 fmacs s1 , s7, s11 @@ -175,8 +175,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } - fldmias Y!, { s8 } + vldmia.f32 X!, { s4 } + vldmia.f32 Y!, { s8 } fmacs s0 , s4, s8 .endm @@ -185,26 +185,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 nop - fldmias X, { s4 } - fldmias Y, { s8 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s8 } add X, X, INC_X add Y, Y, INC_Y fmacs s0 , s4, s8 - fldmias X, { s5 } - fldmias Y, { s9 } + vldmia.f32 X, { s5 } + vldmia.f32 Y, { s9 } add X, X, INC_X add Y, Y, INC_Y fmacs s1 , s5, s9 - fldmias X, { s6 } - fldmias Y, { s10 } + vldmia.f32 X, { s6 } + vldmia.f32 Y, { s10 } add X, X, INC_X add Y, Y, INC_Y fmacs s0 , s6, s10 - fldmias X, { s7 } - fldmias Y, { s11 } + vldmia.f32 X, { s7 } + vldmia.f32 Y, { s11 } add X, X, INC_X add Y, Y, INC_Y fmacs s1 , s7, s11 @@ -214,8 +214,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1 - fldmias X, { s4 } - fldmias Y, { s8 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s8 } add X, X, INC_X fmacs s0 , s4, s8 add Y, Y, INC_Y diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index 1f21e5a1f..c072f4126 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -112,8 +112,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - fldmias AO! , { s0 - s3 } - fldmias BO! , { s4 - s5 } + vldmia.f32 AO! , { s0 - s3 } + vldmia.f32 BO! , { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 6491d3571..789643f56 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -136,29 +136,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I pld [ AO , #A_PRE ] - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } pld [ BO , #B_PRE ] - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 @@ -174,20 +174,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias BO!, { s8 - s11 } + vldmia.f32 BO!, { s8 - s11 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - //fldmias AO!, { s2 - s3 } + //vldmia.f32 AO!, { s2 - s3 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - //fldmias BO!, { s10 - s11 } + //vldmia.f32 BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -203,17 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s7 } + vldmia.f32 AO!, { s4 - s7 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias BO!, { s12 - s15 } - //fldmias AO!, { s6 - s7 } + vldmia.f32 BO!, { s12 - s15 } + //vldmia.f32 AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 fmacs s21 , s1, s9 fmacs s22 , s2, s9 - //fldmias BO!, { s14 - s15 } + //vldmia.f32 BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -300,7 +300,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA add r4 , CO2, r3 - fldmias CO1, { s8 - s11 } + vldmia.f32 CO1, { s8 - s11 } fmacs s8 , s0 , s16 flds s12, [CO2] @@ -322,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO1 , #C_PRE ] - fldmias r4, { s8 - s11 } + vldmia.f32 r4, { s8 - s11 } fmacs s8 , s0 , s24 fsts s12, [CO2] @@ -338,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add CO2, r4 , r3 - fldmias CO2, { s12 - s15 } + vldmia.f32 CO2, { s12 - s15 } fsts s8 , [r4 ] fmacs s12, s0 , s28 @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s15, s0 , s31 pld [ r4 , #C_PRE ] - fstmias CO2, { s12 - s15 } + vstmia.f32 CO2, { s12 - s15 } pld [ CO2 , #C_PRE ] add CO1, CO1, #16 diff --git a/kernel/arm/sgemm_tcopy_4_vfp.S b/kernel/arm/sgemm_tcopy_4_vfp.S index 9bb0e46b1..e61613c5c 100644 --- a/kernel/arm/sgemm_tcopy_4_vfp.S +++ b/kernel/arm/sgemm_tcopy_4_vfp.S @@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4_1 pld [ AO1, #A_PRE ] - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmias r3, { s8 - s11 } + vldmia.f32 r3, { s8 - s11 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmias r3, { s12 - s15 } + vldmia.f32 r3, { s12 - s15 } - fstmias BO1, { s0 - s15 } + vstmia.f32 BO1, { s0 - s15 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4_2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } add r3, r3, LDA - fldmias r3, { s8 - s11 } + vldmia.f32 r3, { s8 - s11 } add r3, r3, LDA - fldmias r3, { s12 - s15 } + vldmia.f32 r3, { s12 - s15 } - fstmias BO1, { s0 - s15 } + vstmia.f32 BO1, { s0 - s15 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -118,18 +118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } add r3, r3, LDA - fldmias r3, { s4 - s5 } + vldmia.f32 r3, { s4 - s5 } add r3, r3, LDA - fldmias r3, { s6 - s7 } + vldmia.f32 r3, { s6 - s7 } - fstmias BO2, { s0 - s7 } + vstmia.f32 BO2, { s0 - s7 } add AO1, AO1, #8 add BO2, BO2, #32 @@ -137,18 +137,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x4 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } add r3, AO1, LDA - fldmias r3, { s1 } + vldmia.f32 r3, { s1 } add r3, r3, LDA - fldmias r3, { s2 } + vldmia.f32 r3, { s2 } add r3, r3, LDA - fldmias r3, { s3 } + vldmia.f32 r3, { s3 } - fstmias BO3, { s0 - s3 } + vstmia.f32 BO3, { s0 - s3 } add AO1, AO1, #4 add BO3, BO3, #16 @@ -158,12 +158,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } - fstmias BO1, { s0 - s7 } + vstmia.f32 BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -171,12 +171,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } - fstmias BO2, { s0 - s3 } + vstmia.f32 BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 @@ -184,12 +184,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } add r3, AO1, LDA - fldmias r3, { s1 } + vldmia.f32 r3, { s1 } - fstmias BO3, { s0 - s1 } + vstmia.f32 BO3, { s0 - s1 } add AO1, AO1, #4 add BO3, BO3, #8 @@ -199,9 +199,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY4x1 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } - fstmias BO1, { s0 - s3 } + vstmia.f32 BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -209,9 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x1 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } - fstmias BO2, { s0 - s1 } + vstmia.f32 BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 @@ -219,9 +219,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } - fstmias BO3, { s0 } + vstmia.f32 BO3, { s0 } add AO1, AO1, #4 add BO3, BO3, #4 diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 635b1dd13..34fa0ee39 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -118,8 +118,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s5 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index e24d24eba..0f601d5b8 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -122,30 +122,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } pld [ AO , #A_PRE-8 ] - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } pld [ BO , #B_PRE-8 ] fmuls s16 , s0, s8 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 @@ -161,20 +161,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -190,17 +190,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s21 , s1, s9 fmacs s22 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -325,7 +325,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fsts s11, [r4 , #12 ] fmuls s15, s0 , s31 - fstmias CO2, { s12 - s15 } + vstmia.f32 CO2, { s12 - s15 } add CO1, CO1, #16 diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S index 76661da79..0b3d98912 100644 --- a/kernel/arm/swap_vfp.S +++ b/kernel/arm/swap_vfp.S @@ -103,29 +103,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} .endm .macro KERNEL_F1 - fldmiad X, { d0 } - fldmiad Y, { d4 } - fstmiad Y!, { d0 } - fstmiad X!, { d4 } + vldmia.f64 X, { d0 } + vldmia.f64 Y, { d4 } + vstmia.f64 Y!, { d0 } + vstmia.f64 X!, { d4 } .endm .macro KERNEL_S1 - fldmiad X, { d0 } - fldmiad Y, { d4 } - fstmiad Y, { d0 } - fstmiad X, { d4 } + vldmia.f64 X, { d0 } + vldmia.f64 Y, { d4 } + vstmia.f64 Y, { d0 } + vstmia.f64 X, { d4 } add X, X, INC_X add Y, Y, INC_Y @@ -135,29 +135,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} .endm .macro KERNEL_F1 - fldmias X, { s0 } - fldmias Y, { s4 } - fstmias Y!, { s0 } - fstmias X!, { s4 } + vldmia.f32 X, { s0 } + vldmia.f32 Y, { s4 } + vstmia.f32 Y!, { s0 } + vstmia.f32 X!, { s4 } .endm .macro KERNEL_S1 - fldmias X, { s0 } - fldmias Y, { s4 } - fstmias Y, { s0 } - fstmias X, { s4 } + vldmia.f32 X, { s0 } + vldmia.f32 Y, { s4 } + vstmia.f32 Y, { s0 } + vstmia.f32 X, { s4 } add X, X, INC_X add Y, Y, INC_Y @@ -174,35 +174,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} .endm .macro KERNEL_F1 - fldmiad X, { d0 - d1 } - fldmiad Y, { d4 - d5 } - fstmiad Y!, { d0 - d1 } - fstmiad X!, { d4 - d5 } + vldmia.f64 X, { d0 - d1 } + vldmia.f64 Y, { d4 - d5 } + vstmia.f64 Y!, { d0 - d1 } + vstmia.f64 X!, { d4 - d5 } .endm .macro KERNEL_S1 - fldmiad X, { d0 - d1 } - fldmiad Y, { d4 - d5 } - fstmiad Y, { d0 - d1 } - fstmiad X, { d4 - d5 } + vldmia.f64 X, { d0 - d1 } + vldmia.f64 Y, { d4 - d5 } + vstmia.f64 Y, { d0 - d1 } + vstmia.f64 X, { d4 - d5 } add X, X, INC_X add Y, Y, INC_Y @@ -215,33 +215,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} .endm .macro KERNEL_F1 - fldmias X, { s0 - s1 } - fldmias Y, { s4 - s5 } - fstmias Y!, { s0 - s1 } - fstmias X!, { s4 - s5 } + vldmia.f32 X, { s0 - s1 } + vldmia.f32 Y, { s4 - s5 } + vstmia.f32 Y!, { s0 - s1 } + vstmia.f32 X!, { s4 - s5 } .endm .macro KERNEL_S1 - fldmias X, { s0 - s1 } - fldmias Y, { s4 - s5 } - fstmias Y, { s0 - s1 } - fstmias X, { s4 - s5 } + vldmia.f32 X, { s0 - s1 } + vldmia.f32 Y, { s4 - s5 } + vstmia.f32 Y, { s0 - s1 } + vstmia.f32 X, { s4 - s5 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/zcopy_vfp.S b/kernel/arm/zcopy_vfp.S index 48aee4ce0..899dd1e36 100644 --- a/kernel/arm/zcopy_vfp.S +++ b/kernel/arm/zcopy_vfp.S @@ -66,15 +66,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ X, #X_PRE+32 ] - fldmiad X!, { d0 - d7 } - fstmiad Y!, { d0 - d7 } + vldmia.f64 X!, { d0 - d7 } + vstmia.f64 Y!, { d0 - d7 } .endm .macro COPY_F1 - fldmiad X!, { d0 - d1 } - fstmiad Y!, { d0 - d1 } + vldmia.f64 X!, { d0 - d1 } + vstmia.f64 Y!, { d0 - d1 } .endm @@ -84,23 +84,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d2 - d3 } - fstmiad Y, { d2 - d3 } + vldmia.f64 X, { d2 - d3 } + vstmia.f64 Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d2 - d3 } - fstmiad Y, { d2 - d3 } + vldmia.f64 X, { d2 - d3 } + vstmia.f64 Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index c0cd92d3c..5ef9f16a9 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -76,15 +76,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 - fldmiad Y!, { d10 - d11 } + vldmia.f64 Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 pld [ X, #X_PRE ] @@ -93,15 +93,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ Y, #X_PRE ] - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 - fldmiad Y!, { d10 - d11 } + vldmia.f64 Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 fmacd d2 , d7, d11 @@ -111,8 +111,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -127,8 +127,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -136,8 +136,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -145,8 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -168,8 +168,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 53d18b07b..7934a500e 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -360,7 +360,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -372,9 +372,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } - fldmiad CO2, { d4 - d7 } + vldmia.f64 CO2, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -386,7 +386,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad CO2, { d4 - d7 } + vstmia.f64 CO2, { d4 - d7 } add CO1, CO1, #32 @@ -543,23 +543,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } - fldmiad CO2, { d4 - d5 } + vldmia.f64 CO2, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad CO2, { d4 - d5 } + vstmia.f64 CO2, { d4 - d5 } add CO1, CO1, #16 @@ -714,7 +714,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -726,7 +726,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -843,14 +843,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index a9d4eddeb..cbb10f342 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -374,8 +374,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } - fldmiad CO2, { d8 - d11 } + vldmia.f64 CO1, { d4 - d7 } + vldmia.f64 CO2, { d8 - d11 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -406,8 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 - fstmiad CO1, { d4 - d7 } - fstmiad CO2, { d8 - d11 } + vstmia.f64 CO1, { d4 - d7 } + vstmia.f64 CO2, { d8 - d11 } add CO1, CO1, #32 @@ -570,8 +570,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } - fldmiad CO2, { d8 - d9 } + vldmia.f64 CO1, { d4 - d5 } + vldmia.f64 CO2, { d8 - d9 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -588,8 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 - fstmiad CO1, { d4 - d5 } - fstmiad CO2, { d8 - d9 } + vstmia.f64 CO1, { d4 - d5 } + vstmia.f64 CO2, { d8 - d9 } add CO1, CO1, #16 @@ -752,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -769,7 +769,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -887,7 +887,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -897,7 +897,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/zgemm_tcopy_2_vfp.S b/kernel/arm/zgemm_tcopy_2_vfp.S index 7e27ca6a6..5e1a384b1 100644 --- a/kernel/arm/zgemm_tcopy_2_vfp.S +++ b/kernel/arm/zgemm_tcopy_2_vfp.S @@ -74,13 +74,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } - fstmiad BO1, { d0 - d7 } + vstmia.f64 BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -88,12 +88,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmiad AO1, { d0 -d1 } + vldmia.f64 AO1, { d0 -d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } - fstmiad BO2, { d0 - d3 } + vstmia.f64 BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 @@ -102,9 +102,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*************************************************************************************************************************/ .macro COPY2x1 - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } - fstmiad BO1, { d0 - d3 } + vstmia.f64 BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -112,9 +112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } - fstmiad BO2, { d0 - d1 } + vstmia.f64 BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index 3e3a1bc07..4e64d8785 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -204,7 +204,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -216,9 +216,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -230,7 +230,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -269,14 +269,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, #16 @@ -352,47 +352,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d10 FMAC_I1 d7 , d0 , d11 FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 2193083af..c66fa4fb8 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -151,12 +151,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO! , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 - fldmiad AO2!, { d8 - d9 } + vldmia.f64 AO2!, { d8 - d9 } KMAC_R d12 , d5 , d3 KMAC_I d13 , d5 , d2 @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -205,8 +205,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO! , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -217,14 +217,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO!, { d4 - d5 } + vstmia.f64 YO!, { d4 - d5 } .endm @@ -250,9 +250,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } - fldmiad AO2!, { d8 - d9 } + vldmia.f64 XO , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } + vldmia.f64 AO2!, { d8 - d9 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -270,25 +270,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S2 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y @@ -314,8 +314,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -328,14 +328,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y From 6fc85a63596bd1fe85f066f35c358b5815d38fe1 Mon Sep 17 00:00:00 2001 From: fengruilin Date: Wed, 26 Sep 2018 15:14:04 +0800 Subject: [PATCH 210/935] test_axpy work error on LOONGSON3A platform #1777 --- kernel/mips64/axpy_loongson3a.S | 14 ++++++++++++++ kernel/mips64/daxpy_loongson3a_simd.S | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/kernel/mips64/axpy_loongson3a.S b/kernel/mips64/axpy_loongson3a.S index 5904bc580..765e5ebbb 100644 --- a/kernel/mips64/axpy_loongson3a.S +++ b/kernel/mips64/axpy_loongson3a.S @@ -270,6 +270,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 .L20: + beqz INCY, .L27 dsra I, N, 3 move YY, Y @@ -450,5 +451,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. j $31 NOP + .align 3 +.L27: + LD b1, 0 * SIZE(Y) + +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + j .L999 + ST b1, 0 * SIZE(Y) + EPILOGUE diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S index f54008bc2..23225770a 100644 --- a/kernel/mips64/daxpy_loongson3a_simd.S +++ b/kernel/mips64/daxpy_loongson3a_simd.S @@ -562,6 +562,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //INCX!=1 or INCY != 1 .L20: + beq INCY, $0, .L27 dsra I, N, 3 move YY, Y @@ -754,5 +755,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
j $31 NOP + .align 3 +.L27: + LD b1, 0 * SIZE(Y) + +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + j .L999 + ST b1, 0 * SIZE(Y) + EPILOGUE From 9b2a7ad40d22e08f7d3a2e1443aa3f8a10c7b77f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 Sep 2018 23:05:15 +0200 Subject: [PATCH 211/935] Convert fldmia/fstmia instructions to UAL syntax for clang7 second part of fix for #1774, containing files missed in #1775 --- kernel/arm/cgemm_ncopy_2_vfp.S | 8 ++++---- kernel/arm/dgemm_ncopy_2_vfp.S | 8 ++++---- kernel/arm/dgemm_ncopy_4_vfp.S | 16 ++++++++-------- kernel/arm/sgemm_ncopy_2_vfp.S | 8 ++++---- kernel/arm/sgemm_ncopy_4_vfp.S | 16 ++++++++-------- kernel/arm/zgemm_ncopy_2_vfp.S | 8 ++++---- kernel/arm/ztrmm_kernel_2x2_vfp.S | 12 ++++++------ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 12 ++++++------ 8 files changed, 44 insertions(+), 44 deletions(-) diff --git a/kernel/arm/cgemm_ncopy_2_vfp.S b/kernel/arm/cgemm_ncopy_2_vfp.S index 29eeab492..fe4959988 100644 --- a/kernel/arm/cgemm_ncopy_2_vfp.S +++ b/kernel/arm/cgemm_ncopy_2_vfp.S @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s6 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] - fstmias BO!, { s0 - s7 } + vstmia.f32 BO!, { s0 - s7 } add AO2, AO2, #16 .endm @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO2, #4 ] add AO1, AO1, #8 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO2, AO2, #8 .endm @@ -111,7 +111,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO1, AO1, #16 .endm @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S index 6266c61d2..9642b6478 100644 --- a/kernel/arm/dgemm_ncopy_2_vfp.S +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO2, AO2, #16 .endm @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO2, AO2, #8 .endm @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO1, AO1, #16 .endm @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] - fstmiad BO!, { d0 } + vstmia.f64 BO!, { d0 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/dgemm_ncopy_4_vfp.S b/kernel/arm/dgemm_ncopy_4_vfp.S index ffc19a9cc..5760cbd8a 100644 --- a/kernel/arm/dgemm_ncopy_4_vfp.S +++ b/kernel/arm/dgemm_ncopy_4_vfp.S @@ -105,10 +105,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d11, [ AO4, #16 ] fldd d15, [ AO4, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO4, AO4, #32 - fstmiad BO!, { d4 - d7 } - fstmiad BO!, { d8 - d15 } + vstmia.f64 BO!, { d4 - d7 } + vstmia.f64 BO!, { d8 - d15 } .endm @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO4, #0 ] add AO3, AO3, #8 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO4, AO4, #8 .endm @@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d5 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] - fstmiad BO!, { d0 - d7 } + vstmia.f64 BO!, { d0 - d7 } add AO2, AO2, #32 .endm @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO2, AO2, #8 .endm @@ -164,7 +164,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO1, AO1, #32 .endm @@ -174,7 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] - fstmiad BO!, { d0 } + vstmia.f64 BO!, { d0 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S index ff4ff0845..dd4596602 100644 --- a/kernel/arm/sgemm_ncopy_2_vfp.S +++ b/kernel/arm/sgemm_ncopy_2_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO2, #4 ] add AO1, AO1, #8 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO2, AO2, #8 .endm @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s1 , [ AO2, #0 ] add AO1, AO1, #4 - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO2, AO2, #4 .endm @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO1, AO1, #8 .endm @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] - fstmias BO!, { s0 } + vstmia.f32 BO!, { s0 } add AO1, AO1, #4 .endm diff --git a/kernel/arm/sgemm_ncopy_4_vfp.S b/kernel/arm/sgemm_ncopy_4_vfp.S index ab013134e..dbcea5961 100644 --- a/kernel/arm/sgemm_ncopy_4_vfp.S +++ b/kernel/arm/sgemm_ncopy_4_vfp.S @@ -100,10 +100,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s11, [ AO4, #8 ] flds s15, [ AO4, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO4, AO4, #16 - fstmias BO!, { s4 - s7 } - fstmias BO!, { s8 - s15 } + vstmia.f32 BO!, { s4 - s7 } + vstmia.f32 BO!, { s8 - s15 } .endm @@ -117,7 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO4, #0 ] add AO3, AO3, #4 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO4, AO4, #4 .endm @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s5 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] - fstmias BO!, { s0 - s7 } + vstmia.f32 BO!, { s0 - s7 } add AO2, AO2, #16 .endm @@ -147,7 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s1 , [ AO2, #0 ] add AO1, AO1, #4 - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO2, AO2, #4 .endm @@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO1, AO1, #16 .endm @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] - fstmias BO!, { s0 } + vstmia.f32 BO!, { s0 } add AO1, AO1, #4 .endm diff --git a/kernel/arm/zgemm_ncopy_2_vfp.S b/kernel/arm/zgemm_ncopy_2_vfp.S index b3fa225bb..d0661da2a 100644 --- a/kernel/arm/zgemm_ncopy_2_vfp.S +++ b/kernel/arm/zgemm_ncopy_2_vfp.S @@ -87,7 +87,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d6 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] - fstmiad BO!, { d0 - d7 } + vstmia.f64 BO!, { d0 - d7 } add AO2, AO2, #32 .endm @@ -101,7 +101,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO2, AO2, #16 .endm @@ -113,7 +113,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO1, AO1, #32 .endm @@ -124,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO1, AO1, #16 .endm diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index cb6bc050e..4393bc9f6 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -385,7 +385,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 @@ -402,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad CO2, { d4 - d7 } + vstmia.f64 CO2, { d4 - d7 } add CO1, CO1, #32 @@ -567,7 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 @@ -577,7 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad CO2, { d4 - d5 } + vstmia.f64 CO2, { d4 - d5 } add CO1, CO1, #16 @@ -747,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -872,7 +872,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 3e6962f06..39b12caa0 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -391,8 +391,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 - fstmiad CO1, { d4 - d7 } - fstmiad CO2, { d8 - d11 } + vstmia.f64 CO1, { d4 - d7 } + vstmia.f64 CO2, { d8 - d11 } add CO1, CO1, #32 @@ -569,8 +569,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 - fstmiad CO1, { d4 - d5 } - fstmiad CO2, { d8 - d9 } + vstmia.f64 CO1, { d4 - d5 } + vstmia.f64 CO2, { d8 - d9 } add CO1, CO1, #16 @@ -747,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -872,7 +872,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 From 45fe8cb0c5d06f890913e86078cb48ac379c65dc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 3 Oct 2018 14:45:25 +0000 Subject: [PATCH 212/935] Create a AVX512 enabled version of DGEMM This patch adds dgemm_kernel_4x8_skylakex.c which is * dgemm_kernel_4x8_haswell.s converted to C + intrinsics * 8x8 support added * 8x8 kernel implemented using AVX512 Performance is a work in progress, but already shows a 10% - 20% increase for a wide range of matrix sizes. --- kernel/x86_64/KERNEL.SKYLAKEX | 16 +- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 1288 +++++++++++++++++++++ 2 files changed, 1293 insertions(+), 11 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_4x8_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 1256f4c3c..ba149512d 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -2,18 +2,12 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S +DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -#DTRMMKERNEL = ../generic/trmmkernel_16x2.c -#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S -#DGEMMINCOPY = ../generic/gemm_ncopy_16.c -#DGEMMITCOPY = ../generic/gemm_tcopy_16.c -#DGEMMONCOPY = ../generic/gemm_ncopy_2.c -#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = ../generic/gemm_beta.c diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c new file mode 100644 index 000000000..4162611ff --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -0,0 +1,1288 @@ +/********************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+/*
+ * This file is based on dgemm_kernel_4x8_haswell.s (original copyright above).
+ * The content was translated from ASM to C plus intrinsics, significantly simplified,
+ * and AVX512 support added by Arjan van de Ven
+ */
+
+
+#include "common.h"
+#include <immintrin.h>
+
+
+/*******************************************************************************************
+* Macro definitions
+*******************************************************************************************/
+
+
+/******************************************************************************************/
+
+
+#define INIT4x8() \
+ ymm4 = _mm256_setzero_pd(); \
+ ymm5 = _mm256_setzero_pd(); \
+ ymm6 = _mm256_setzero_pd(); \
+ ymm7 = _mm256_setzero_pd(); \
+ ymm8 = _mm256_setzero_pd(); \
+ ymm9 = _mm256_setzero_pd(); \
+ ymm10 = _mm256_setzero_pd(); \
+ ymm11 = _mm256_setzero_pd(); \
+
+
+#define KERNEL4x8_SUB() \
+ ymm0 = _mm256_loadu_pd(AO - 16); \
+/* ymm0 [ A B C D ] */ \
+ ymm1 = _mm256_loadu_pd(BO - 12); \
+ ymm2 = _mm256_loadu_pd(BO - 8); \
+/* ymm1 [ 1 2 3 4 ] */ \
+/* ymm2 [ 5 6 7 8 ] */ \
+ \
+ ymm4 += ymm0 * ymm1; \
+/* ymm4 += [ A*1 | B*2 | C*3 | D*4 ] */ \
+ ymm8 += ymm0 * ymm2; \
+/* ymm8 += [ A*5 | B*6 | C*7 | D*8 ] */ \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
+/* ymm0 [ B A D C ] */ \
+ ymm5 += ymm0 * ymm1; \
+/* ymm5 += [ B*1 | A*2 | D*3 | C*4 ] */ \
+ ymm9 += ymm0 * ymm2; \
+/* ymm9 += [ B*5 | A*6 | D*7 | C*8 ] */ \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \
+/* ymm0 [ C D A B ] */ \
+ ymm6 += ymm0 * ymm1; \
+/* ymm6 += [ C*1 | D*2 | A*3 | B*4 ] */ \
+ ymm10+= ymm0 * ymm2; \
+/* ymm10 += [ C*5 | D*6 | A*7 | B*8 ] */ \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
+/* ymm0 [ D C B A ] */ \
+ ymm7 += ymm0 * ymm1; \
+/* ymm7 += [ D*1 | C*2 | B*3 | A*4 ] */ \
+ ymm11+= ymm0 * ymm2; \
+/* ymm11 += [ D*5 | C*6 | B*7 | A*8 ] */ \
+ AO += 4; \
+ BO += 8;
+
+
+#define SAVE4x8(ALPHA) \
+ ymm0 = _mm256_set1_pd(ALPHA); \
+ ymm4 *= ymm0; \
+ ymm5 *= ymm0; \
+ ymm6 *= ymm0; \
+ ymm7 *= ymm0; \
+ ymm8 *= ymm0; \
+ ymm9 *= ymm0; \
+ ymm10 *= ymm0; \
+ ymm11 *= ymm0; \
+ \
+/* Entry values: */ \
+/* ymm4 = a [ A*1 | B*2 | C*3 | D*4 ] */ \
+/* ymm5 = a [ B*1 | A*2 | D*3 | C*4 ] */ \
+/* ymm6 = a [ C*1 | D*2 | A*3 | B*4 ] */ \
+/* ymm7 = a [ D*1 | C*2 | B*3 | A*4 ] */ \
+/* ymm8 = a [ A*5 | B*6 | C*7 | D*8 ] */ \
+/* ymm9 = a [ B*5 | A*6 | D*7 | C*8 ] */ \
+/* ymm10 = a [ C*5 | D*6 | A*7 | B*8 ] */ \
+/* ymm11 = a [ D*5 | C*6 | B*7 | A*8 ] */ \
+ \
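+/* the kernel leaves each 4x4 tile of products rotated across lanes; the */ \
+/* permute/blend sequence below regroups them so ymm4..ymm7 each hold one */ \
+/* column of the C tile before it is added to memory */ \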
+	ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \
+/* ymm0 = a [ A*1 | B*1 | C*3 | D*3 ] */ \
+/* ymm1 = a [ A*2 | B*2 | C*4 | D*4 ] */ \
+	ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \
+	ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \
+/* ymm2 = a [ C*1 | D*1 | A*3 | B*3 ] */ \
+/* ymm3 = a [ C*2 | D*2 | A*4 | B*4 ] */ \
+	\
+	ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \
+	ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \
+/* ymm2 = a [ B*3 | A*3 | D*1 | C*1 ] */ \
+/* ymm3 = a [ B*4 | A*4 | D*2 | C*2 ] */ \
+	ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \
+	ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \
+/* ymm2 = a [ A*3 | B*3 | C*1 | D*1 ] */ \
+/* ymm3 = a [ A*4 | B*4 | C*2 | D*2 ] */ \
+	\
+	ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \
+	ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \
+/* ymm4 = a [ A*1 | B*1 | C*1 | D*1 ] */ \
+/* ymm5 = a [ A*2 | B*2 | C*2 | D*2 ] */ \
+	ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \
+	ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \
+/* ymm6 = a [ A*3 | B*3 | C*3 | D*3 ] */ \
+/* ymm7 = a [ A*4 | B*4 | C*4 | D*4 ] */ \
+	\
+	ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \
+	ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \
+	ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \
+	ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \
+	_mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \
+	_mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \
+	_mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \
+	_mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \
+	\
+	ymm9 = _mm256_permute4x64_pd(ymm9, 0xb1); \
+	ymm11 = _mm256_permute4x64_pd(ymm11, 0xb1); \
+	\
+	ymm0 = _mm256_blend_pd(ymm8, ymm9, 0x0a); \
+	ymm1 = _mm256_blend_pd(ymm8, ymm9, 0x05); \
+	ymm2 = _mm256_blend_pd(ymm10, ymm11, 0x0a); \
+	ymm3 = _mm256_blend_pd(ymm10, ymm11, 0x05); \
+	\
+	ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \
+	ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \
+	ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \
+	ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \
+	\
+	ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \
+	ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \
+	ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \
+	ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \
+	\
+	ymm4 += _mm256_loadu_pd(CO1 + (4 * ldc)); \
+	ymm5 += _mm256_loadu_pd(CO1 + (5 * ldc)); \
+	ymm6 += _mm256_loadu_pd(CO1 + (6 * ldc)); \
+	ymm7 += _mm256_loadu_pd(CO1 + (7 * ldc)); \
+	_mm256_storeu_pd(CO1 + (4 * ldc), ymm4); \
+	_mm256_storeu_pd(CO1 + (5 * ldc), ymm5); \
+	_mm256_storeu_pd(CO1 + (6 * ldc), ymm6); \
+	_mm256_storeu_pd(CO1 + (7 * ldc), ymm7); \
+	\
+	CO1 += 4;
+
+/******************************************************************************************/
+
+#define INIT2x8() \
+	xmm4 = _mm_setzero_pd(); \
+	xmm5 = _mm_setzero_pd(); \
+	xmm6 = _mm_setzero_pd(); \
+	xmm7 = _mm_setzero_pd(); \
+	xmm8 = _mm_setzero_pd(); \
+	xmm9 = _mm_setzero_pd(); \
+	xmm10 = _mm_setzero_pd(); \
+	xmm11 = _mm_setzero_pd(); \
+
+
+#define KERNEL2x8_SUB() \
+	xmm0 = _mm_loadu_pd(AO - 16); \
+	xmm1 = _mm_set1_pd(*(BO - 12)); \
+	xmm2 = _mm_set1_pd(*(BO - 11)); \
+	xmm3 = _mm_set1_pd(*(BO - 10)); \
+	xmm4 += xmm0 * xmm1; \
+	xmm1 = _mm_set1_pd(*(BO - 9)); \
+	xmm5 += xmm0 * xmm2; \
+	xmm2 = _mm_set1_pd(*(BO - 8)); \
+	xmm6 += xmm0 * xmm3; \
+	xmm3 = _mm_set1_pd(*(BO - 7)); \
+	xmm7 += xmm0 * xmm1; \
+	xmm1 = _mm_set1_pd(*(BO - 6)); \
+	xmm8 += xmm0 * xmm2; \
+	xmm2 = _mm_set1_pd(*(BO - 5)); \
+	xmm9 += xmm0 * xmm3; \
+	xmm10 += xmm0 * xmm1; \
+	xmm11 += xmm0 * xmm2; \
+	BO += 8; \
+	AO += 2;
+
+#define SAVE2x8(ALPHA) \
+	xmm0 = _mm_set1_pd(ALPHA); \
+	xmm4 *= xmm0; \
+	xmm5 *= xmm0; \
+	xmm6 *= xmm0; \
+	xmm7 *= xmm0; \
+	xmm8 *= xmm0; \
+	xmm9 *= xmm0; \
+
xmm10 *= xmm0; \ + xmm11 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \ + xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \ + xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \ + \ + _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \ + _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \ + _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \ + _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \ + \ + xmm8 += _mm_loadu_pd(CO1 + (4 * ldc)); \ + xmm9 += _mm_loadu_pd(CO1 + (5 * ldc)); \ + xmm10+= _mm_loadu_pd(CO1 + (6 * ldc)); \ + xmm11+= _mm_loadu_pd(CO1 + (7 * ldc)); \ + _mm_storeu_pd(CO1 + (4 * ldc), xmm8); \ + _mm_storeu_pd(CO1 + (5 * ldc), xmm9); \ + _mm_storeu_pd(CO1 + (6 * ldc), xmm10); \ + _mm_storeu_pd(CO1 + (7 * ldc), xmm11); \ + CO1 += 2; + + + + +/******************************************************************************************/ + +#define INIT1x8() \ + dbl4 = 0; \ + dbl5 = 0; \ + dbl6 = 0; \ + dbl7 = 0; \ + dbl8 = 0; \ + dbl9 = 0; \ + dbl10 = 0; \ + dbl11 = 0; + + +#define KERNEL1x8_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl3 = *(BO - 10); \ + dbl4 += dbl0 * dbl1; \ + dbl1 = *(BO - 9); \ + dbl5 += dbl0 * dbl2; \ + dbl2 = *(BO - 8); \ + dbl6 += dbl0 * dbl3; \ + dbl3 = *(BO - 7); \ + dbl7 += dbl0 * dbl1; \ + dbl1 = *(BO - 6); \ + dbl8 += dbl0 * dbl2; \ + dbl2 = *(BO - 5); \ + dbl9 += dbl0 * dbl3; \ + dbl10 += dbl0 * dbl1; \ + dbl11 += dbl0 * dbl2; \ + BO += 8; \ + AO += 1; + + +#define SAVE1x8(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + dbl6 *= dbl0; \ + dbl7 *= dbl0; \ + dbl8 *= dbl0; \ + dbl9 *= dbl0; \ + dbl10 *= dbl0; \ + dbl11 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + dbl6 += *(CO1 + (2 * ldc)); \ + dbl7 += *(CO1 + (3 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + *(CO1 + (2 * ldc)) = dbl6; \ + *(CO1 + (3 * ldc)) = dbl7; \ + \ + dbl8 += *(CO1 + (4 * ldc)); \ + dbl9 += *(CO1 + (5 * ldc)); \ + dbl10 += *(CO1 + (6 * ldc)); \ + dbl11 += *(CO1 + (7 * ldc)); \ + *(CO1 + (4 * ldc)) = dbl8; \ + *(CO1 + (5 * ldc)) = dbl9; \ + *(CO1 + (6 * ldc)) = dbl10; \ + *(CO1 + (7 * ldc)) = dbl11; \ + \ + CO1 += 1; + + + + + + +/******************************************************************************************/ + +#define INIT4x4() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + + +#define KERNEL4x4_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(BO - 12); \ + \ + ymm4 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm5 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ + ymm6 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm7 += ymm0 * ymm1; \ + AO += 4; \ + BO += 4; + + +#define SAVE4x4(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + \ + ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ + ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \ + \ + ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ + ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ + ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ + \ + ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ 
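+/* same un-rotation as in SAVE4x8: ymm4..ymm7 end up as the four column contributions added to C below */ \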
+ ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \ + _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \ + \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x4() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + + + +#define KERNEL2x4_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_set1_pd(*(BO - 12)); \ + xmm2 = _mm_set1_pd(*(BO - 11)); \ + xmm3 = _mm_set1_pd(*(BO - 10)); \ + xmm4 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 9)); \ + xmm5 += xmm0 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + xmm7 += xmm0 * xmm1; \ + BO += 4; \ + AO += 2; + + + +#define SAVE2x4(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \ + xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \ + xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \ + \ + _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \ + _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \ + _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \ + _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \ + \ + CO1 += 2; + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x4() \ + dbl4 = 0; \ + dbl5 = 0; \ + dbl6 = 0; \ + dbl7 = 0; \ + +#define KERNEL1x4_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl3 = *(BO - 10); \ + dbl8 = *(BO - 9); \ + \ + dbl4 += dbl0 * dbl1; \ + dbl5 += dbl0 * dbl2; \ + dbl6 += dbl0 * dbl3; \ + dbl7 += dbl0 * dbl8; \ + BO += 4; \ + AO += 1; + + +#define SAVE1x4(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + dbl6 *= dbl0; \ + dbl7 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + dbl6 += *(CO1 + (2 * ldc)); \ + dbl7 += *(CO1 + (3 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + *(CO1 + (2 * ldc)) = dbl6; \ + *(CO1 + (3 * ldc)) = dbl7; \ + \ + \ + CO1 += 1; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT8x4() \ + ymm10 = _mm256_setzero_pd(); \ + ymm11 = _mm256_setzero_pd(); \ + ymm12 = _mm256_setzero_pd(); \ + ymm13 = _mm256_setzero_pd(); \ + ymm14 = _mm256_setzero_pd(); \ + ymm15 = _mm256_setzero_pd(); \ + ymm16 = _mm256_setzero_pd(); \ + ymm17 = _mm256_setzero_pd(); \ + + +#define KERNEL8x4_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(AO - 12); \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm3 = _mm256_set1_pd(*(BO - 11)); \ + ymm4 = _mm256_set1_pd(*(BO - 10)); \ + ymm5 = _mm256_set1_pd(*(BO - 9)); \ + ymm10 += ymm0 * ymm2; \ + ymm11 += ymm1 * ymm2; \ + ymm12 += ymm0 * ymm3; \ + ymm13 += ymm1 * ymm3; \ + ymm14 += ymm0 * ymm4; \ + ymm15 += ymm1 * ymm4; \ + ymm16 += ymm0 * ymm5; \ + ymm17 += ymm1 * ymm5; \ + BO += 4; \ + AO += 8; + + + +#define SAVE8x4(ALPHA) \ + ymm0 = 
_mm256_set1_pd(ALPHA); \ + ymm10 *= ymm0; \ + ymm11 *= ymm0; \ + ymm12 *= ymm0; \ + ymm13 *= ymm0; \ + ymm14 *= ymm0; \ + ymm15 *= ymm0; \ + ymm16 *= ymm0; \ + ymm17 *= ymm0; \ + \ + ymm10 += _mm256_loadu_pd(CO1); \ + ymm11 += _mm256_loadu_pd(CO1 + 4); \ + ymm12 += _mm256_loadu_pd(CO1 + (ldc)); \ + ymm13 += _mm256_loadu_pd(CO1 + (ldc) + 4); \ + ymm14 += _mm256_loadu_pd(CO1 + (ldc*2)); \ + ymm15 += _mm256_loadu_pd(CO1 + (ldc*2) + 4); \ + ymm16 += _mm256_loadu_pd(CO1 + (ldc*3)); \ + ymm17 += _mm256_loadu_pd(CO1 + (ldc*3) + 4); \ + \ + _mm256_storeu_pd(CO1, ymm10); \ + _mm256_storeu_pd(CO1 + 4, ymm11); \ + _mm256_storeu_pd(CO1 + ldc, ymm12); \ + _mm256_storeu_pd(CO1 + ldc + 4, ymm13); \ + _mm256_storeu_pd(CO1 + ldc*2, ymm14); \ + _mm256_storeu_pd(CO1 + ldc*2 + 4, ymm15); \ + _mm256_storeu_pd(CO1 + ldc*3, ymm16); \ + _mm256_storeu_pd(CO1 + ldc*3 + 4, ymm17); \ + \ + CO1 += 8; + + +/******************************************************************************************/ +/******************************************************************************************/ +#define INIT8x2() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + + +#define KERNEL8x2_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(AO - 12); \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm3 = _mm256_set1_pd(*(BO - 11)); \ + ymm4 += ymm0 * ymm2; \ + ymm5 += ymm1 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + ymm7 += ymm1 * ymm3; \ + BO += 2; \ + AO += 8; + + + +#define SAVE8x2(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1); \ + ymm5 += _mm256_loadu_pd(CO1 + 4); \ + ymm6 += _mm256_loadu_pd(CO1 + (ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (ldc) + 4); \ + \ + _mm256_storeu_pd(CO1, ymm4); \ + _mm256_storeu_pd(CO1 + 4, ymm5); \ + _mm256_storeu_pd(CO1 + ldc, ymm6); \ + _mm256_storeu_pd(CO1 + ldc + 4, ymm7); \ + \ + CO1 += 8; + + +/******************************************************************************************/ +/******************************************************************************************/ +#define INIT4x2() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + + +#define KERNEL4x2_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_loadu_pd(AO - 14); \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm3 = _mm_set1_pd(*(BO - 11)); \ + xmm4 += xmm0 * xmm2; \ + xmm5 += xmm1 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + xmm7 += xmm1 * xmm3; \ + BO += 2; \ + AO += 4; + + + +#define SAVE4x2(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + xmm5 += _mm_loadu_pd(CO1 + 2); \ + xmm6 += _mm_loadu_pd(CO1 + (ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (ldc) + 2); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + _mm_storeu_pd(CO1 + 2, xmm5); \ + _mm_storeu_pd(CO1 + ldc, xmm6); \ + _mm_storeu_pd(CO1 + ldc + 2, xmm7); \ + \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x2() \ + xmm4 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + + + +#define KERNEL2x2_SUB() \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm3 = _mm_set1_pd(*(BO - 11)); \ + xmm4 += xmm0 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + BO += 2; \ + 
AO += 2; + + +#define SAVE2x2(ALPHA) \ + if (ALPHA != 1.0) { \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm6 *= xmm0; \ + } \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + xmm6 += _mm_loadu_pd(CO1 + ldc); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + _mm_storeu_pd(CO1 + ldc, xmm6); \ + \ + CO1 += 2; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x2() \ + dbl4 = 0; \ + dbl5 = 0; + + +#define KERNEL1x2_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl4 += dbl0 * dbl1; \ + dbl5 += dbl0 * dbl2; \ + BO += 2; \ + AO += 1; + + +#define SAVE1x2(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + \ + \ + CO1 += 1; + + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT4x1() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); + + +#define KERNEL4x1() \ + ymm0 = _mm256_set1_pd(*(BO - 12)); \ + ymm1 = _mm256_set1_pd(*(BO - 11)); \ + ymm2 = _mm256_set1_pd(*(BO - 10)); \ + ymm3 = _mm256_set1_pd(*(BO - 9)); \ + \ + ymm4 += _mm256_loadu_pd(AO - 16) * ymm0; \ + ymm5 += _mm256_loadu_pd(AO - 12) * ymm1; \ + \ + ymm0 = _mm256_set1_pd(*(BO - 8)); \ + ymm1 = _mm256_set1_pd(*(BO - 7)); \ + \ + ymm6 += _mm256_loadu_pd(AO - 8) * ymm2; \ + ymm7 += _mm256_loadu_pd(AO - 4) * ymm3; \ + \ + ymm2 = _mm256_set1_pd(*(BO - 6)); \ + ymm3 = _mm256_set1_pd(*(BO - 5)); \ + \ + ymm4 += _mm256_loadu_pd(AO + 0) * ymm0; \ + ymm5 += _mm256_loadu_pd(AO + 4) * ymm1; \ + ymm6 += _mm256_loadu_pd(AO + 8) * ymm2; \ + ymm7 += _mm256_loadu_pd(AO + 12) * ymm3; \ + \ + BO += 8; \ + AO += 32; + + +#define INIT8x1() \ + zmm4 = _mm512_setzero_pd(); \ + + +#define KERNEL8x1_SUB() \ + zmm2 = _mm512_set1_pd(*(BO - 12)); \ + zmm0 = _mm512_loadu_pd(AO - 16); \ + zmm4 += zmm0 * zmm2; \ + BO += 1; \ + AO += 8; + + +#define SAVE8x1(ALPHA) \ + zmm0 = _mm512_set1_pd(ALPHA); \ + zmm4 *= zmm0; \ + \ + zmm4 += _mm512_loadu_pd(CO1); \ + _mm512_storeu_pd(CO1, zmm4); \ + CO1 += 8; + +#define KERNEL4x1_SUB() \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm4 += ymm0 * ymm2; \ + BO += 1; \ + AO += 4; + + +#define SAVE4x1(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 += ymm5; \ + ymm6 += ymm7; \ + ymm4 += ymm6; \ + ymm4 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1); \ + _mm256_storeu_pd(CO1, ymm4); \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x1() \ + xmm4 = _mm_setzero_pd(); + + +#define KERNEL2x1_SUB() \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm4 += xmm0 * xmm2; \ + BO += 1; \ + AO += 2; + + +#define SAVE2x1(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + \ + CO1 += 2; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x1() \ + dbl4 = 
0; + +#define KERNEL1x1_SUB() \ + dbl1 = *(BO - 12); \ + dbl0 = *(AO - 16); \ + dbl4 += dbl0 * dbl1; \ + BO += 1; \ + AO += 1; + +#define SAVE1x1(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl4 += *CO1; \ + *CO1 = dbl4; \ + CO1 += 1; + + +/*******************************************************************************************/ + +/* START */ + + +int __attribute__ ((noinline)) +dgemm_kernel(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc) +{ + unsigned long M=m, N=n, K=k; + + + if (M == 0) + return 0; + if (N == 0) + return 0; + if (K == 0) + return 0; + + while (N >= 8) { + double *CO1; + double *AO; + int i; + + CO1 = C; + C += 8 * ldc; + + AO = A + 16; + + i = m; + + while (i >= 8) { + double *BO; + int kloop = K; + + BO = B + 12; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vbroadcastsd (%[alpha]), %%zmm9\n" + "jmp .label1\n" + ".align 32\n" + /* Inner math loop */ + ".label1:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vfmadd231pd -96(%[BO])%{1to8%}, %%zmm0, %%zmm1\n" + "vfmadd231pd -88(%[BO])%{1to8%}, %%zmm0, %%zmm2\n" + "vfmadd231pd -80(%[BO])%{1to8%}, %%zmm0, %%zmm3\n" + "vfmadd231pd -72(%[BO])%{1to8%}, %%zmm0, %%zmm4\n" + "vfmadd231pd -64(%[BO])%{1to8%}, %%zmm0, %%zmm5\n" + "vfmadd231pd -56(%[BO])%{1to8%}, %%zmm0, %%zmm6\n" + "vfmadd231pd -48(%[BO])%{1to8%}, %%zmm0, %%zmm7\n" + "vfmadd231pd -40(%[BO])%{1to8%}, %%zmm0, %%zmm8\n" + "add $64, %[AO]\n" + "add $64, %[BO]\n" + "subl $1, %[kloop]\n" + "jg .label1\n" + /* multiply the result by alpha */ + "vmulpd %%zmm9, %%zmm1, %%zmm1\n" + "vmulpd %%zmm9, %%zmm2, %%zmm2\n" + "vmulpd %%zmm9, %%zmm3, %%zmm3\n" + "vmulpd %%zmm9, %%zmm4, %%zmm4\n" + "vmulpd %%zmm9, %%zmm5, %%zmm5\n" + "vmulpd %%zmm9, %%zmm6, %%zmm6\n" + "vmulpd %%zmm9, %%zmm7, %%zmm7\n" + "vmulpd %%zmm9, %%zmm8, %%zmm8\n" + /* And store additively in C */ + "vaddpd (%[C0]), %%zmm1, %%zmm1\n" + "vaddpd (%[C1]), %%zmm2, %%zmm2\n" + "vaddpd (%[C2]), %%zmm3, %%zmm3\n" + "vaddpd (%[C3]), %%zmm4, %%zmm4\n" + "vaddpd (%[C4]), %%zmm5, %%zmm5\n" + "vaddpd (%[C5]), %%zmm6, %%zmm6\n" + "vaddpd (%[C6]), %%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + "prefetchw 64(%[C0])\n" + "prefetchw 64(%[C1])\n" + "prefetchw 64(%[C2])\n" + "prefetchw 64(%[C3])\n" + "prefetchw 64(%[C4])\n" + "prefetchw 64(%[C5])\n" + "prefetchw 64(%[C6])\n" + "prefetchw 64(%[C7])\n" + : + [AO] "+r" (AO), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9" + ); + CO1 += 8; + i-= 8; + } + + + + while (i >= 4) { + double *BO; + 
__m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11; + int kloop = K; + + BO = B + 12; + INIT4x8() + + while (kloop > 0) { + KERNEL4x8_SUB() + kloop--; + } + SAVE4x8(alpha) + i-= 4; + } + + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + int kloop = K; + + BO = B + 12; + INIT2x8() + + while (kloop > 0) { + KERNEL2x8_SUB() + kloop--; + } + SAVE2x8(alpha) + i -= 2; + } + + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8, dbl9, dbl10, dbl11; + int kloop = K; + + BO = B + 12; + INIT1x8() + + while (kloop > 0) { + KERNEL1x8_SUB() + kloop--; + } + SAVE1x8(alpha) + i -= 1; + } + B += K * 8; + N -= 8; + } + + if (N == 0) + return 0; + + + + // L8_0 + while (N >= 4) { + double *CO1; + double *AO; + int i; + // L8_10 + CO1 = C; + C += 4 * ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + // L8_11 + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm10, ymm11,ymm12,ymm13,ymm14,ymm15,ymm16,ymm17; + BO = B + 12; + int kloop = K; + + INIT8x4() + + while (kloop > 0) { + // L12_17 + KERNEL8x4_SUB() + kloop--; + } + // L8_19 + SAVE8x4(alpha) + + i -= 8; + } + while (i >= 4) { + // L8_11 + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + BO = B + 12; + int kloop = K; + + INIT4x4() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x4_SUB() + kloop--; + } + // L8_19 + SAVE4x4(alpha) + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + BO = B; + BO += 12; + + INIT2x4() + int kloop = K; + + while (kloop > 0) { + KERNEL2x4_SUB() + kloop--; + } + SAVE2x4(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8; + int kloop = K; + BO = B + 12; + INIT1x4() + + while (kloop > 0) { + KERNEL1x4_SUB() + kloop--; + } + SAVE1x4(alpha) + i -= 1; + } + + B += K * 4; + N -= 4; + } + +/**************************************************************************************************/ + + // L8_0 + while (N >= 2) { + double *CO1; + double *AO; + int i; + // L8_10 + CO1 = C; + C += 2 * ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT8x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x2_SUB() + kloop--; + } + // L8_19 + SAVE8x2(alpha) + + i-=8; + } + + while (i >= 4) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT4x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x2_SUB() + kloop--; + } + // L8_19 + SAVE4x2(alpha) + + i-=4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm2, xmm3, xmm4, xmm6; + int kloop = K; + BO = B + 12; + + INIT2x2() + + while (kloop > 0) { + KERNEL2x2_SUB() + kloop--; + } + SAVE2x2(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl4, dbl5; + int kloop = K; + BO = B + 12; + + INIT1x2() + + while (kloop > 0) { + KERNEL1x2_SUB() + kloop--; + } + SAVE1x2(alpha) + i -= 1; + } + + B += K * 2; 
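+		/* two more columns of C are finished; B already points at the next packed pair */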
+		N -= 2;
+	}
+
+	// L8_0
+	while (N >= 1) {
+		// L8_10
+		double *CO1;
+		double *AO;
+		int i;
+
+		CO1 = C;
+		C += ldc;
+
+		AO = A + 16;
+
+		i = m;
+		while (i >= 8) {
+			double *BO;
+			__m512d zmm0, zmm2, zmm4;
+			// L8_11
+			BO = B + 12;
+			int kloop = K;
+
+			INIT8x1()
+			// L8_16
+			while (kloop > 0) {
+				// L12_17
+				KERNEL8x1_SUB()
+				kloop--;
+			}
+			// L8_19
+			SAVE8x1(alpha)
+
+			i-= 8;
+		}
+		while (i >= 4) {
+			double *BO;
+			__m256d ymm0, ymm2, ymm4, ymm5, ymm6, ymm7;
+			// L8_11
+			BO = B + 12;
+			int kloop = K;
+
+			INIT4x1()
+			// L8_16
+			while (kloop > 0) {
+				// L12_17
+				KERNEL4x1_SUB()
+				kloop--;
+			}
+			// L8_19
+			SAVE4x1(alpha)
+
+			i-= 4;
+		}
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+
+		while (i >= 2) {
+			double *BO;
+			__m128d xmm0, xmm2, xmm4;
+			int kloop = K;
+			BO = B;
+			BO += 12;
+
+			INIT2x1()
+
+			while (kloop > 0) {
+				KERNEL2x1_SUB()
+				kloop--;
+			}
+			SAVE2x1(alpha)
+			i -= 2;
+		}
+		// L13_40
+		while (i >= 1) {
+			double *BO;
+			double dbl0, dbl1, dbl4;
+			int kloop = K;
+
+			BO = B;
+			BO += 12;
+			INIT1x1()
+
+
+			while (kloop > 0) {
+				KERNEL1x1_SUB()
+				kloop--;
+			}
+			SAVE1x1(alpha)
+			i -= 1;
+		}
+
+		B += K * 1;
+		N -= 1;
+	}
+
+
+	return 0;
+}

From 3439158dea277d132b3804c245cba1f09b4329dd Mon Sep 17 00:00:00 2001
From: Andrew <16061801+brada4@users.noreply.github.com>
Date: Wed, 3 Oct 2018 21:20:50 +0200
Subject: [PATCH 213/935] address #1782 2nd loop

---
 driver/others/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 0019253c0..4a8e6c067 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -2587,20 +2587,20 @@ void *blas_memory_alloc(int procpos){

   position = 0;

+  LOCK_COMMAND(&alloc_lock);
   do {
/*    if (!memory[position].used) { */
-      LOCK_COMMAND(&alloc_lock);
/*      blas_lock(&memory[position].lock);*/

      if (!memory[position].used) goto allocation;
-      UNLOCK_COMMAND(&alloc_lock);
/*      blas_unlock(&memory[position].lock);*/
/*    } */

    position ++;

  } while (position < NUM_BUFFERS);
+  UNLOCK_COMMAND(&alloc_lock);

  goto error;

From 591cca7cb05486320230ff8f09255a8d300c20ad Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 4 Oct 2018 07:35:30 +0200
Subject: [PATCH 214/935] Check availability of immintrin.h in the AVX512
 compatibility test

---
 c_check | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c_check b/c_check
index 66acf1cad..9dc237beb 100644
--- a/c_check
+++ b/c_check
@@ -205,7 +205,7 @@ $binformat    = bin64 if ($data =~ /BINARY_64/);
 $no_avx512= 0;
 if (($architecture eq "x86") || ($architecture eq "x86_64")) {
     $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
-    print $tmpf "int main(void){ __asm__ volatile($code); }\n";
+    print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
     $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
     my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
     system(@cmd) == 0;

From 4c3643ed7f50f13df5efe637f05ffbc705e1860a Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 4 Oct 2018 07:36:49 +0200
Subject: [PATCH 215/935] Check availability of immintrin.h in the AVX512
 compatibility test

---
 cmake/system_check.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index d339a755f..fe30c7600 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -67,7 +67,7 @@ else()
 endif()

 if (X86_64 OR X86)
-  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
+  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
   execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
   if (NO_AVX512 EQUAL 1)
   set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")

From b095f2fad651d3134b5760d25acda686f3a831b7 Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Thu, 4 Oct 2018 12:27:44 +0200
Subject: [PATCH 216/935] Fix unknown type name __WAIT_STATUS on RHEL5

With glibc 2.5 one must have #define _XOPEN_SOURCE >= 500 to use wait.
But reading glibc code this is actually needed only if stdlib.h was
included before sys/wait.h. This was the case here through
openblas_utest.h. So changing the include order fixes compilation on
RHEL5 and should not hurt with more recent distros.

* Problem found when building with gcc 5.5 and 4.7.2 on RHEL5/CENTOS5
* Fix #1519
---
 utest/test_fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utest/test_fork.c b/utest/test_fork.c
index 9fc51287c..0b90407b1 100644
--- a/utest/test_fork.c
+++ b/utest/test_fork.c
@@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/

-#include "openblas_utest.h"
 #include <sys/wait.h>
 #include <cblas.h>
 #include <unistd.h>
+#include "openblas_utest.h"

 void* xmalloc(size_t n)
 {

From b7496c36384a681428e60993c2cd7c721ca4dfe5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 4 Oct 2018 19:14:59 +0200
Subject: [PATCH 217/935] Function name needs to be CNAME, set from outside to
 allow suffixing for dynamic_arch

---
 kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
index 4162611ff..8d0205c5a 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c
@@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
int __attribute__ ((noinline)) -dgemm_kernel(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc) { unsigned long M=m, N=n, K=k; From c3e0f0eb3865c372b112a2449fc04d84a1f36515 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Fri, 5 Oct 2018 15:41:52 +0300 Subject: [PATCH 218/935] update travis alpine chroot with avx512 intrinsics headers --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4a25e7121..6e27a6fe4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -85,8 +85,8 @@ jobs: sudo: true language: minimal before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ - && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" + - "wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install \ + && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } install: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' From bda3dbe2eb8fb837330d9b5f501ad1eaed81d437 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Fri, 5 Oct 2018 15:47:55 +0300 Subject: [PATCH 219/935] update travis alpine chroot with avx512 intrinsics headers --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6e27a6fe4..a0af0472e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -85,7 +85,7 @@ jobs: sudo: true language: minimal before_install: - - "wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install \ + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } install: From 1938819c25d7dd4ba995900797f5123e4cfd6fa4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 5 Oct 2018 11:49:43 +0000 Subject: [PATCH 220/935] skylake dgemm: Add a 16x8 kernel The next step for the avx512 dgemm code is adding a 16x8 kernel. In the 8x8 kernel, each FMA has a matching load (the broadcast); in the 16x8 kernel we can reuse this load for 2 FMAs, which in turn reduces pressure on the load ports of the CPU and gives a nice performance boost (in the 25% range). 
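To make the trick easier to follow, here is the same load-reuse idea as a
plain intrinsics sketch (illustrative only: the patch below keeps the
inline asm, and the names in the sketch are made up):

    #include <immintrin.h>

    /* one k step of a 16x8 tile: two 8-wide loads from the A panel, then
       one broadcast per B element feeding two FMAs */
    static inline void step_16x8(const double *a0p, const double *a1p,
                                 const double *b, __m512d c0[8], __m512d c1[8])
    {
        __m512d a0 = _mm512_loadu_pd(a0p);   /* rows 0..7  */
        __m512d a1 = _mm512_loadu_pd(a1p);   /* rows 8..15 */
        for (int j = 0; j < 8; j++) {
            __m512d bj = _mm512_set1_pd(b[j]);       /* one load ...  */
            c0[j] = _mm512_fmadd_pd(bj, a0, c0[j]);  /* ... two FMAs  */
            c1[j] = _mm512_fmadd_pd(bj, a1, c1[j]);
        }
    }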
--- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 155 +++++++++++++++++++++- 1 file changed, 154 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index 8d0205c5a..09d48f99a 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -849,11 +849,13 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, i = m; - while (i >= 8) { + while (i >= 16) { double *BO; + double *A1; int kloop = K; BO = B + 12; + A1 = AO + 8 * K; /* * This is the inner loop for the hot hot path * Written in inline asm because compilers like GCC 8 and earlier @@ -861,6 +863,157 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, * the AVX512 built in broadcast ability (1to8) */ asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vmovapd %%zmm1, %%zmm11\n" + "vmovapd %%zmm1, %%zmm12\n" + "vmovapd %%zmm1, %%zmm13\n" + "vmovapd %%zmm1, %%zmm14\n" + "vmovapd %%zmm1, %%zmm15\n" + "vmovapd %%zmm1, %%zmm16\n" + "vmovapd %%zmm1, %%zmm17\n" + "vmovapd %%zmm1, %%zmm18\n" + "jmp .label16\n" + ".align 32\n" + /* Inner math loop */ + ".label16:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vmovupd -128(%[A1]),%%zmm10\n" + + "vbroadcastsd -96(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n" + + "vbroadcastsd -88(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n" + + "vbroadcastsd -80(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n" + + "vbroadcastsd -72(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n" + + "vbroadcastsd -64(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n" + + "vbroadcastsd -56(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n" + + "vbroadcastsd -48(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n" + + "vbroadcastsd -40(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n" + "add $64, %[AO]\n" + "add $64, %[A1]\n" + "add $64, %[BO]\n" + "prefetch 512(%[AO])\n" + "prefetch 512(%[A1])\n" + "prefetch 512(%[BO])\n" + "subl $1, %[kloop]\n" + "jg .label16\n" + /* multiply the result by alpha */ + "vbroadcastsd (%[alpha]), %%zmm9\n" + "vmulpd %%zmm9, %%zmm1, %%zmm1\n" + "vmulpd %%zmm9, %%zmm2, %%zmm2\n" + "vmulpd %%zmm9, %%zmm3, %%zmm3\n" + "vmulpd %%zmm9, %%zmm4, %%zmm4\n" + "vmulpd %%zmm9, %%zmm5, %%zmm5\n" + "vmulpd %%zmm9, %%zmm6, %%zmm6\n" + "vmulpd %%zmm9, %%zmm7, %%zmm7\n" + "vmulpd %%zmm9, %%zmm8, %%zmm8\n" + "vmulpd %%zmm9, %%zmm11, %%zmm11\n" + "vmulpd %%zmm9, %%zmm12, %%zmm12\n" + "vmulpd %%zmm9, %%zmm13, %%zmm13\n" + "vmulpd %%zmm9, %%zmm14, %%zmm14\n" + "vmulpd %%zmm9, %%zmm15, %%zmm15\n" + "vmulpd %%zmm9, %%zmm16, %%zmm16\n" + "vmulpd %%zmm9, %%zmm17, %%zmm17\n" + "vmulpd %%zmm9, %%zmm18, %%zmm18\n" + /* And store additively in C */ + "vaddpd (%[C0]), %%zmm1, %%zmm1\n" + "vaddpd (%[C1]), %%zmm2, %%zmm2\n" + "vaddpd (%[C2]), %%zmm3, %%zmm3\n" + "vaddpd (%[C3]), %%zmm4, %%zmm4\n" + "vaddpd (%[C4]), %%zmm5, %%zmm5\n" + "vaddpd (%[C5]), 
%%zmm6, %%zmm6\n" + "vaddpd (%[C6]), %%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + + "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" + "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" + "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" + "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" + "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" + "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" + "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" + "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vmovupd %%zmm11, 64(%[C0])\n" + "vmovupd %%zmm12, 64(%[C1])\n" + "vmovupd %%zmm13, 64(%[C2])\n" + "vmovupd %%zmm14, 64(%[C3])\n" + "vmovupd %%zmm15, 64(%[C4])\n" + "vmovupd %%zmm16, 64(%[C5])\n" + "vmovupd %%zmm17, 64(%[C6])\n" + "vmovupd %%zmm18, 64(%[C7])\n" + + : + [AO] "+r" (AO), + [A1] "+r" (A1), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", + "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18" + ); + CO1 += 16; + AO += 8 * K; + i-= 16; + } + + while (i >= 8) { + double *BO; + int kloop = K; + + BO = B + 12; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( "vxorpd %%zmm1, %%zmm1, %%zmm1\n" "vmovapd %%zmm1, %%zmm2\n" "vmovapd %%zmm1, %%zmm3\n" From 66b43affbc24a69e841930d18c30758542aa381c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 5 Oct 2018 13:22:21 +0000 Subject: [PATCH 221/935] Add a 24x8 kernel to the skylakex dgemm implementation Minor gains for small matrixes, but at 512x512 and above the gain gets more significant. 
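For scale: this tile keeps 3 x 8 = 24 zmm accumulators live, plus three A
vectors and the shared broadcast register (see the clobber list below),
i.e. 28 of the 32 zmm registers. A 32x8 tile would already need 32
accumulators before any operands, so 24 rows is the practical ceiling for
this blocking scheme.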
--- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 201 ++++++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index 09d48f99a..293bd4a99 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -849,6 +849,207 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, i = m; + while (i >= 24) { + double *BO; + double *A1, *A2; + int kloop = K; + + BO = B + 12; + A1 = AO + 8 * K; + A2 = AO + 16 * K; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vmovapd %%zmm1, %%zmm11\n" + "vmovapd %%zmm1, %%zmm12\n" + "vmovapd %%zmm1, %%zmm13\n" + "vmovapd %%zmm1, %%zmm14\n" + "vmovapd %%zmm1, %%zmm15\n" + "vmovapd %%zmm1, %%zmm16\n" + "vmovapd %%zmm1, %%zmm17\n" + "vmovapd %%zmm1, %%zmm18\n" + "vmovapd %%zmm1, %%zmm21\n" + "vmovapd %%zmm1, %%zmm22\n" + "vmovapd %%zmm1, %%zmm23\n" + "vmovapd %%zmm1, %%zmm24\n" + "vmovapd %%zmm1, %%zmm25\n" + "vmovapd %%zmm1, %%zmm26\n" + "vmovapd %%zmm1, %%zmm27\n" + "vmovapd %%zmm1, %%zmm28\n" + "jmp .label24\n" + ".align 32\n" + /* Inner math loop */ + ".label24:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vmovupd -128(%[A1]),%%zmm10\n" + "vmovupd -128(%[A2]),%%zmm20\n" + + "vbroadcastsd -96(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm21\n" + + "vbroadcastsd -88(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm22\n" + + "vbroadcastsd -80(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm23\n" + + "vbroadcastsd -72(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm24\n" + + "vbroadcastsd -64(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm25\n" + + "vbroadcastsd -56(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm26\n" + + "vbroadcastsd -48(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm27\n" + + "vbroadcastsd -40(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm28\n" + "add $64, %[AO]\n" + "add $64, %[A1]\n" + "add $64, %[A2]\n" + "add $64, %[BO]\n" + "prefetch 512(%[AO])\n" + "prefetch 512(%[A1])\n" + "prefetch 512(%[A2])\n" + "prefetch 512(%[BO])\n" + "subl $1, %[kloop]\n" + "jg .label24\n" + /* multiply the result by alpha */ + "vbroadcastsd (%[alpha]), %%zmm9\n" + "vmulpd %%zmm9, %%zmm1, %%zmm1\n" + "vmulpd %%zmm9, %%zmm2, %%zmm2\n" + "vmulpd %%zmm9, %%zmm3, %%zmm3\n" + "vmulpd %%zmm9, %%zmm4, %%zmm4\n" + "vmulpd %%zmm9, %%zmm5, %%zmm5\n" + "vmulpd %%zmm9, 
%%zmm6, %%zmm6\n" + "vmulpd %%zmm9, %%zmm7, %%zmm7\n" + "vmulpd %%zmm9, %%zmm8, %%zmm8\n" + "vmulpd %%zmm9, %%zmm11, %%zmm11\n" + "vmulpd %%zmm9, %%zmm12, %%zmm12\n" + "vmulpd %%zmm9, %%zmm13, %%zmm13\n" + "vmulpd %%zmm9, %%zmm14, %%zmm14\n" + "vmulpd %%zmm9, %%zmm15, %%zmm15\n" + "vmulpd %%zmm9, %%zmm16, %%zmm16\n" + "vmulpd %%zmm9, %%zmm17, %%zmm17\n" + "vmulpd %%zmm9, %%zmm18, %%zmm18\n" + "vmulpd %%zmm9, %%zmm21, %%zmm21\n" + "vmulpd %%zmm9, %%zmm22, %%zmm22\n" + "vmulpd %%zmm9, %%zmm23, %%zmm23\n" + "vmulpd %%zmm9, %%zmm24, %%zmm24\n" + "vmulpd %%zmm9, %%zmm25, %%zmm25\n" + "vmulpd %%zmm9, %%zmm26, %%zmm26\n" + "vmulpd %%zmm9, %%zmm27, %%zmm27\n" + "vmulpd %%zmm9, %%zmm28, %%zmm28\n" + /* And store additively in C */ + "vaddpd (%[C0]), %%zmm1, %%zmm1\n" + "vaddpd (%[C1]), %%zmm2, %%zmm2\n" + "vaddpd (%[C2]), %%zmm3, %%zmm3\n" + "vaddpd (%[C3]), %%zmm4, %%zmm4\n" + "vaddpd (%[C4]), %%zmm5, %%zmm5\n" + "vaddpd (%[C5]), %%zmm6, %%zmm6\n" + "vaddpd (%[C6]), %%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + + "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" + "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" + "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" + "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" + "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" + "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" + "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" + "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vmovupd %%zmm11, 64(%[C0])\n" + "vmovupd %%zmm12, 64(%[C1])\n" + "vmovupd %%zmm13, 64(%[C2])\n" + "vmovupd %%zmm14, 64(%[C3])\n" + "vmovupd %%zmm15, 64(%[C4])\n" + "vmovupd %%zmm16, 64(%[C5])\n" + "vmovupd %%zmm17, 64(%[C6])\n" + "vmovupd %%zmm18, 64(%[C7])\n" + + "vaddpd 128(%[C0]), %%zmm21, %%zmm21\n" + "vaddpd 128(%[C1]), %%zmm22, %%zmm22\n" + "vaddpd 128(%[C2]), %%zmm23, %%zmm23\n" + "vaddpd 128(%[C3]), %%zmm24, %%zmm24\n" + "vaddpd 128(%[C4]), %%zmm25, %%zmm25\n" + "vaddpd 128(%[C5]), %%zmm26, %%zmm26\n" + "vaddpd 128(%[C6]), %%zmm27, %%zmm27\n" + "vaddpd 128(%[C7]), %%zmm28, %%zmm28\n" + "vmovupd %%zmm21, 128(%[C0])\n" + "vmovupd %%zmm22, 128(%[C1])\n" + "vmovupd %%zmm23, 128(%[C2])\n" + "vmovupd %%zmm24, 128(%[C3])\n" + "vmovupd %%zmm25, 128(%[C4])\n" + "vmovupd %%zmm26, 128(%[C5])\n" + "vmovupd %%zmm27, 128(%[C6])\n" + "vmovupd %%zmm28, 128(%[C7])\n" + + : + [AO] "+r" (AO), + [A1] "+r" (A1), + [A2] "+r" (A2), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", + "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28" + ); + CO1 += 24; + AO += 16 * K; + i-= 24; + } + + while (i >= 16) { double *BO; double *A1; From 79ea839b635d1fd84b6ce8a47e086f01d64198e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiziano=20M=C3=BCller?= Date: Sat, 6 Oct 2018 14:10:02 +0200 Subject: [PATCH 222/935] fix parallel build issues with APFS/HFS+/ext2/3 in netlib-lapack The problem is that OpenBLAS sets the LAPACKE_LIB and the TMGLIB to the same object and uses the `ar` feature to update the archive file. 
If the underlying filesystem does not have sub-second timestamp resolution and the system is fast enough (or `ccache` is used), the timestamp of the builds which should be added to the previously generated archive is the same as the archive file itself and therefore `make` does not update the archive. Since OpenBLAS takes care to not run the different targets updating the archive in parallel, the easiest solution is to declare the respective targets `.PHONY`, forcing `make` to always update them. fixes #1682 --- lapack-netlib/LAPACKE/src/Makefile | 2 ++ lapack-netlib/SRC/Makefile | 2 ++ lapack-netlib/TESTING/MATGEN/Makefile | 2 ++ 3 files changed, 6 insertions(+) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 44884d4a5..7672f9f73 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2454,6 +2454,8 @@ endif all: ../../$(LAPACKELIB) +.PHONY: ../../$(LAPACKELIB) + ../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN) $(ARCH) $(ARCHFLAGS) $@ $(OBJ_A) $(ARCH) $(ARCHFLAGS) $@ $(OBJ_B) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 531cb51fc..87a8f51e4 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -552,6 +552,8 @@ endif all: ../$(LAPACKLIB) +.PHONY: ../$(LAPACKLIB) + ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e20004c2f..a1d784fa5 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -57,6 +57,8 @@ all: ../../$(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) +.PHONY: ../../$(TMGLIB) + ../../$(TMGLIB): $(ALLOBJ) $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ From 474f7e9583a85630345458abb71b7246def3f10f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Oct 2018 14:28:04 +0200 Subject: [PATCH 223/935] Add SYMBOLPREFIX and -SUFFIX options and improve help output --- CMakeLists.txt | 114 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 96 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97c3b7777..ca951d401 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,16 +15,21 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(OpenBLAS_LIBNAME openblas) - ####### if(MSVC) -option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) +option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() -option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) -option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) -option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) -option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) +option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) +option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) +option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) + +# Add a prefix or suffix to all exported symbol names in the shared library. 
+# Avoids conflicts with other BLAS libraries, especially when using +# 64 bit integer interfaces in OpenBLAS. + +set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) +set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) ####### if(BUILD_WITHOUT_LAPACK) set(NO_LAPACK 1) @@ -38,11 +43,13 @@ endif() ####### -message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") +message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") +set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) + set(BLASDIRS interface driver/level2 driver/level3 driver/others) if (NOT DYNAMIC_ARCH) @@ -210,15 +217,84 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES SOVERSION ${OpenBLAS_MAJOR_VERSION} ) +if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") +if (NOT DEFINED ARCH) + set(ARCH_IN "x86_64") +else() + set(ARCH_IN ${ARCH}) +endif() + +if (${CORE} STREQUAL "generic") + set(ARCH_IN "GENERIC") +endif () + +if (NOT DEFINED EXPRECISION) + set(EXPRECISION_IN 0) +else() + set(EXPRECISION_IN ${EXPRECISION}) +endif() + +if (NOT DEFINED NO_CBLAS) + set(NO_CBLAS_IN 0) +else() + set(NO_CBLAS_IN ${NO_CBLAS}) +endif() + +if (NOT DEFINED NO_LAPACK) + set(NO_LAPACK_IN 0) +else() + set(NO_LAPACK_IN ${NO_LAPACK}) +endif() + +if (NOT DEFINED NO_LAPACKE) + set(NO_LAPACKE_IN 0) +else() + set(NO_LAPACKE_IN ${NO_LAPACKE}) +endif() + +if (NOT DEFINED NEED2UNDERSCORES) + set(NEED2UNDERSCORES_IN 0) +else() + set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) +endif() + +if (NOT DEFINED ONLY_CBLAS) + set(ONLY_CBLAS_IN 0) +else() + set(ONLY_CBLAS_IN ${ONLY_CBLAS}) +endif() + +if (NOT DEFINED BU) + set(BU _) +endif() + +if (NOT ${SYMBOLPREFIX} STREQUAL "") +message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") +endif() +if (NOT ${SYMBOLSUFFIX} STREQUAL "") +message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") +endif() + add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMENT "renaming symbols" + ) +endif() + + # Install project # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} - EXPORT "OpenBLASTargets" + EXPORT "OpenBLAS${SUFFIX64}Targets" RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +# Install headers +set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) +set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) + message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}") set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h) @@ -266,29 +342,31 @@ if(NOT 
NO_LAPACKE) ADD_CUSTOM_TARGET(genlapacke COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() include(FindPkgConfig QUIET) if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) + configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) + install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". set(PN OpenBLAS) -set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") +set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") configure_package_config_file(cmake/${PN}Config.cmake.in - "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake VERSION ${${PN}_VERSION} COMPATIBILITY AnyNewerVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + RENAME ${PN}${SUFFIX64}ConfigVersion.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) -install(EXPORT "${PN}Targets" - NAMESPACE "${PN}::" +install(EXPORT "${PN}${SUFFIX64}Targets" + NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) From d74dc39b0faeebb7aeb97e4099dcb50a1fcc7533 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 13:47:20 +0000 Subject: [PATCH 224/935] Add optimized *copy versions for skylakex Add optimized n/t copy versions for skylakex; in the patch the tcopy is also rewritten using intrinsics; the ncopy file will be worked on in a future commit --- kernel/x86_64/KERNEL.SKYLAKEX | 8 +- kernel/x86_64/dgemm_ncopy_8_skylakex.c | 422 +++++++++++++++++++++++++ kernel/x86_64/dgemm_tcopy_8_skylakex.c | 417 ++++++++++++++++++++++++ 3 files changed, 843 insertions(+), 4 deletions(-) create mode 100644 kernel/x86_64/dgemm_ncopy_8_skylakex.c create mode 100644 kernel/x86_64/dgemm_tcopy_8_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index ba149512d..e34cda770 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -4,10 +4,10 @@ SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_8.c -DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c +DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = ../generic/gemm_beta.c diff --git a/kernel/x86_64/dgemm_ncopy_8_skylakex.c b/kernel/x86_64/dgemm_ncopy_8_skylakex.c new file mode 100644 index 000000000..3bc55b8cc --- 
/dev/null +++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c @@ -0,0 +1,422 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){
+  BLASLONG i, j;
+
+  FLOAT *aoffset;
+  FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+  FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+
+  FLOAT *boffset;
+  FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+  FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
+  FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+  FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
+  FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
+  FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
+  FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
+  FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
+  FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
+  FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
+  FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
+  FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
+  FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
+  FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
+  FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
+
+
+  aoffset = a;
+  boffset = b;
+
+  j = (n >> 3);
+  if (j > 0){
+    do{
+      aoffset1 = aoffset;
+      aoffset2 = aoffset1 + lda;
+      aoffset3 = aoffset2 + lda;
+      aoffset4 = aoffset3 + lda;
+      aoffset5 = aoffset4 + lda;
+      aoffset6 = aoffset5 + lda;
+      aoffset7 = aoffset6 + lda;
+      aoffset8 = aoffset7 + lda;
+      aoffset += 8 * lda;
+
+      i = (m >> 3);
+      if (i > 0){
+        do{
+          ctemp01 = *(aoffset1 + 0);
+          ctemp02 = *(aoffset1 + 1);
+          ctemp03 = *(aoffset1 + 2);
+          ctemp04 = *(aoffset1 + 3);
+          ctemp05 = *(aoffset1 + 4);
+          ctemp06 = *(aoffset1 + 5);
+          ctemp07 = *(aoffset1 + 6);
+          ctemp08 = *(aoffset1 + 7);
+
+          ctemp09 = *(aoffset2 + 0);
+          ctemp10 = *(aoffset2 + 1);
+          ctemp11 = *(aoffset2 + 2);
+          ctemp12 = *(aoffset2 + 3);
+          ctemp13 = *(aoffset2 + 4);
+          ctemp14 = *(aoffset2 + 5);
+          ctemp15 = *(aoffset2 + 6);
+          ctemp16 = *(aoffset2 + 7);
+
+          ctemp17 = *(aoffset3 + 0);
+          ctemp18 = *(aoffset3 + 1);
+          ctemp19 = *(aoffset3 + 2);
+          ctemp20 = *(aoffset3 + 3);
+          ctemp21 = *(aoffset3 + 4);
+          ctemp22 = *(aoffset3 + 5);
+          ctemp23 = *(aoffset3 + 6);
+          ctemp24 = *(aoffset3 + 7);
+
+          ctemp25 = *(aoffset4 + 0);
+          ctemp26 = *(aoffset4 + 1);
+          ctemp27 = *(aoffset4 + 2);
+          ctemp28 = *(aoffset4 + 3);
+          ctemp29 = *(aoffset4 + 4);
+          ctemp30 = *(aoffset4 + 5);
+          ctemp31 = *(aoffset4 + 6);
+          ctemp32 = *(aoffset4 + 7);
+
+          ctemp33 = *(aoffset5 + 0);
+          ctemp34 = *(aoffset5 + 1);
+          ctemp35 = *(aoffset5 + 2);
+          ctemp36 = *(aoffset5 + 3);
+          ctemp37 = *(aoffset5 + 4);
+          ctemp38 = *(aoffset5 + 5);
+          ctemp39 = *(aoffset5 + 6);
+          ctemp40 = *(aoffset5 + 7);
+
+          ctemp41 = *(aoffset6 + 0);
+          ctemp42 = *(aoffset6 + 1);
+          ctemp43 = *(aoffset6 + 2);
+          ctemp44 = *(aoffset6 + 3);
+          ctemp45 = *(aoffset6 + 4);
+          ctemp46 = *(aoffset6 + 5);
+          ctemp47 = *(aoffset6 + 6);
+          ctemp48 = *(aoffset6 + 7);
+
+          ctemp49 = *(aoffset7 + 0);
+          ctemp50 = *(aoffset7 + 1);
+          ctemp51 = *(aoffset7 + 2);
+          ctemp52 = *(aoffset7 + 3);
+          ctemp53 = *(aoffset7 + 4);
+          ctemp54 = *(aoffset7 + 5);
+          ctemp55 = *(aoffset7 + 6);
+          ctemp56 = *(aoffset7 + 7);
+
+          ctemp57 = *(aoffset8 + 0);
+          ctemp58 = *(aoffset8 + 1);
+          ctemp59 = *(aoffset8 + 2);
+          ctemp60 = *(aoffset8 + 3);
+          ctemp61 = *(aoffset8 + 4);
+          ctemp62 = *(aoffset8 + 5);
+          ctemp63 = *(aoffset8 + 6);
+          ctemp64 = *(aoffset8 + 7);
+
+          *(boffset + 0) = ctemp01;
+          *(boffset + 1) = ctemp09;
+          *(boffset + 2) = ctemp17;
+          *(boffset + 3) = ctemp25;
+          *(boffset + 4) = ctemp33;
+          *(boffset + 5) = ctemp41;
+          *(boffset + 6) = ctemp49;
+          *(boffset + 7) = ctemp57;
+
+          *(boffset + 8) = ctemp02;
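+          /* the stores below complete the 8x8 transpose of this tile:
+             element k of source column c (held in ctemp(8*c + k + 1))
+             lands at boffset[8*k + c], one transposed row per group of
+             eight consecutive stores */
+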
*(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp18; + *(boffset + 11) = ctemp26; + *(boffset + 12) = ctemp34; + *(boffset + 13) = ctemp42; + *(boffset + 14) = ctemp50; + *(boffset + 15) = ctemp58; + + *(boffset + 16) = ctemp03; + *(boffset + 17) = ctemp11; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp27; + *(boffset + 20) = ctemp35; + *(boffset + 21) = ctemp43; + *(boffset + 22) = ctemp51; + *(boffset + 23) = ctemp59; + + *(boffset + 24) = ctemp04; + *(boffset + 25) = ctemp12; + *(boffset + 26) = ctemp20; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp36; + *(boffset + 29) = ctemp44; + *(boffset + 30) = ctemp52; + *(boffset + 31) = ctemp60; + + *(boffset + 32) = ctemp05; + *(boffset + 33) = ctemp13; + *(boffset + 34) = ctemp21; + *(boffset + 35) = ctemp29; + *(boffset + 36) = ctemp37; + *(boffset + 37) = ctemp45; + *(boffset + 38) = ctemp53; + *(boffset + 39) = ctemp61; + + *(boffset + 40) = ctemp06; + *(boffset + 41) = ctemp14; + *(boffset + 42) = ctemp22; + *(boffset + 43) = ctemp30; + *(boffset + 44) = ctemp38; + *(boffset + 45) = ctemp46; + *(boffset + 46) = ctemp54; + *(boffset + 47) = ctemp62; + + *(boffset + 48) = ctemp07; + *(boffset + 49) = ctemp15; + *(boffset + 50) = ctemp23; + *(boffset + 51) = ctemp31; + *(boffset + 52) = ctemp39; + *(boffset + 53) = ctemp47; + *(boffset + 54) = ctemp55; + *(boffset + 55) = ctemp63; + + *(boffset + 56) = ctemp08; + *(boffset + 57) = ctemp16; + *(boffset + 58) = ctemp24; + *(boffset + 59) = ctemp32; + *(boffset + 60) = ctemp40; + *(boffset + 61) = ctemp48; + *(boffset + 62) = ctemp56; + *(boffset + 63) = ctemp64; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp13; + + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp10; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp07; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp15; + + *(boffset + 12) = ctemp04; + *(boffset + 13) = 
ctemp08; + *(boffset + 14) = ctemp12; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/x86_64/dgemm_tcopy_8_skylakex.c b/kernel/x86_64/dgemm_tcopy_8_skylakex.c new file mode 100644 index 000000000..472ad6349 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_8_skylakex.c @@ -0,0 +1,417 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <immintrin.h>
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){
+
+  BLASLONG i, j;
+
+  FLOAT *aoffset;
+  FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+  FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+
+  FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
+
+  FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+
+  aoffset = a;
+  boffset = b;
+
+#if 0
+  fprintf(stderr, "M = %d N = %d\n", m, n);
+#endif
+
+  boffset2 = b + m * (n & ~7);
+  boffset3 = b + m * (n & ~3);
+  boffset4 = b + m * (n & ~1);
+
+  j = (m >> 3);
+  if (j > 0){
+    do{
+      aoffset1 = aoffset;
+      aoffset2 = aoffset1 + lda;
+      aoffset3 = aoffset2 + lda;
+      aoffset4 = aoffset3 + lda;
+      aoffset5 = aoffset4 + lda;
+      aoffset6 = aoffset5 + lda;
+      aoffset7 = aoffset6 + lda;
+      aoffset8 = aoffset7 + lda;
+      aoffset += 8 * lda;
+
+      boffset1 = boffset;
+      boffset += 64;
+
+      i = (n >> 3);
+      if (i > 0){
+        do{
+          __m512d row1, row2, row3, row4, row5, row6, row7, row8;
+          row1 = _mm512_loadu_pd(aoffset1);
+          aoffset1 += 8;
+          row2 = _mm512_loadu_pd(aoffset2);
+          aoffset2 += 8;
+          row3 = _mm512_loadu_pd(aoffset3);
+          aoffset3 += 8;
+          row4 = _mm512_loadu_pd(aoffset4);
+          aoffset4 += 8;
+          row5 = _mm512_loadu_pd(aoffset5);
+          aoffset5 += 8;
+          row6 = _mm512_loadu_pd(aoffset6);
+          aoffset6 += 8;
+          row7 = _mm512_loadu_pd(aoffset7);
+          aoffset7 += 8;
+          row8 = _mm512_loadu_pd(aoffset8);
+          aoffset8 += 8;
+
+          _mm512_storeu_pd(boffset1 + 0, row1);
+          _mm512_storeu_pd(boffset1 + 8, row2);
+          _mm512_storeu_pd(boffset1 + 16, row3);
+          _mm512_storeu_pd(boffset1 + 24, row4);
+          _mm512_storeu_pd(boffset1 + 32, row5);
+          _mm512_storeu_pd(boffset1 + 40, row6);
+          _mm512_storeu_pd(boffset1 + 48, row7);
+          _mm512_storeu_pd(boffset1 + 56, row8);
+          boffset1 += m * 8;
+          i --;
+        }while(i > 0);
+      }
+
+      if (n & 4){
+        __m256d row1, row2, row3, row4, row5, row6, row7, row8;
+        row1 = _mm256_loadu_pd(aoffset1);
+        aoffset1 += 4;
+        row2 = _mm256_loadu_pd(aoffset2);
+        aoffset2 += 4;
+        row3 = _mm256_loadu_pd(aoffset3);
+        aoffset3 += 4;
+        row4 = _mm256_loadu_pd(aoffset4);
+        aoffset4 += 4;
+        row5 = _mm256_loadu_pd(aoffset5);
+        aoffset5 += 4;
+        row6 = _mm256_loadu_pd(aoffset6);
+        aoffset6 += 4;
+        row7 = _mm256_loadu_pd(aoffset7);
+        aoffset7 += 4;
+        row8 = _mm256_loadu_pd(aoffset8);
+        aoffset8 += 4;
+
+        _mm256_storeu_pd(boffset2 + 0, row1);
+        _mm256_storeu_pd(boffset2 + 4, row2);
+        _mm256_storeu_pd(boffset2 + 8, row3);
+        _mm256_storeu_pd(boffset2 + 12, row4);
+        _mm256_storeu_pd(boffset2 + 16, row5);
+        _mm256_storeu_pd(boffset2 + 20, row6);
+        _mm256_storeu_pd(boffset2 + 24, row7);
+        _mm256_storeu_pd(boffset2 + 28, row8);
+        boffset2 += 32;
+      }
+
+      if (n & 2){
+        __m128d row1, row2, row3, row4, row5, row6, row7, row8;
+        row1 = _mm_loadu_pd(aoffset1);
+        aoffset1 += 2;
+
+        row2 = _mm_loadu_pd(aoffset2);
+        aoffset2 += 2;
+
+        row3 = _mm_loadu_pd(aoffset3);
+        aoffset3 += 2;
+
+        row4 = _mm_loadu_pd(aoffset4);
+        aoffset4 += 2;
+
+        row5 = _mm_loadu_pd(aoffset5);
+        aoffset5 += 2;
+
+        row6 = _mm_loadu_pd(aoffset6);
+        aoffset6 += 2;
+
+        row7 = _mm_loadu_pd(aoffset7);
+        aoffset7 += 2;
+
+        row8 = _mm_loadu_pd(aoffset8);
+        aoffset8 += 2;
+
+        _mm_storeu_pd(boffset3 + 0, row1);
+        _mm_storeu_pd(boffset3 + 2, row2);
+        _mm_storeu_pd(boffset3 + 4, row3);
+        _mm_storeu_pd(boffset3 + 6, row4);
+        _mm_storeu_pd(boffset3 + 8, row5);
+        _mm_storeu_pd(boffset3 + 10, row6);
+        _mm_storeu_pd(boffset3 + 12, row7);
+        _mm_storeu_pd(boffset3 + 14,
row8); + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = *(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + *(boffset4 + 4) = ctemp05; + *(boffset4 + 5) = ctemp06; + *(boffset4 + 6) = ctemp07; + *(boffset4 + 7) = ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + __m512d row1, row2, row3, row4; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + row2 = _mm512_loadu_pd(aoffset2); + aoffset2 += 8; + row3 = _mm512_loadu_pd(aoffset3); + aoffset3 += 8; + row4 = _mm512_loadu_pd(aoffset4); + aoffset4 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + _mm512_storeu_pd(boffset1 + 8, row2); + _mm512_storeu_pd(boffset1 + 16, row3); + _mm512_storeu_pd(boffset1 + 24, row4); + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + __m256d row1, row2, row3, row4; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + row2 = _mm256_loadu_pd(aoffset2); + aoffset2 += 4; + row3 = _mm256_loadu_pd(aoffset3); + aoffset3 += 4; + row4 = _mm256_loadu_pd(aoffset4); + aoffset4 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + _mm256_storeu_pd(boffset2 + 4, row2); + _mm256_storeu_pd(boffset2 + 8, row3); + _mm256_storeu_pd(boffset2 + 12, row4); + boffset2 += 16; + } + + if (n & 2){ + __m128d row1, row2, row3, row4; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + row2 = _mm_loadu_pd(aoffset2); + aoffset2 += 2; + + row3 = _mm_loadu_pd(aoffset3); + aoffset3 += 2; + + row4 = _mm_loadu_pd(aoffset4); + aoffset4 += 2; + + + _mm_storeu_pd(boffset3 + 0, row1); + _mm_storeu_pd(boffset3 + 2, row2); + _mm_storeu_pd(boffset3 + 4, row3); + _mm_storeu_pd(boffset3 + 6, row4); + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + __m512d row1, row2; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + row2 = _mm512_loadu_pd(aoffset2); + aoffset2 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + _mm512_storeu_pd(boffset1 + 8, row2); + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + __m256d row1, row2; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + row2 = _mm256_loadu_pd(aoffset2); + aoffset2 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + _mm256_storeu_pd(boffset2 + 4, row2); + boffset2 += 8; + } + + if (n & 2){ + __m128d row1, row2; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + row2 = _mm_loadu_pd(aoffset2); + aoffset2 += 2; + + + _mm_storeu_pd(boffset3 + 0, row1); + _mm_storeu_pd(boffset3 + 2, row2); + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 
++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + // aoffset += lda; + + boffset1 = boffset; + // boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + __m512d row1; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + __m256d row1; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + // boffset2 += 4; + } + + if (n & 2){ + __m128d row1; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + _mm_storeu_pd(boffset3 + 0, row1); + + // boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = ctemp01; + boffset4 ++; + } + } + + return 0; +} From 6d43c51ccf7de3d0f41c2e2b382ada07159cf599 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 14:00:37 +0000 Subject: [PATCH 225/935] undo slow dgemm/skylake microoptimization the compare is more costly than the work --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index 293bd4a99..b5693ea2c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -647,11 +647,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SAVE2x2(ALPHA) \ - if (ALPHA != 1.0) { \ - xmm0 = _mm_set1_pd(ALPHA); \ - xmm4 *= xmm0; \ - xmm6 *= xmm0; \ - } \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm6 *= xmm0; \ \ xmm4 += _mm_loadu_pd(CO1); \ xmm6 += _mm_loadu_pd(CO1 + ldc); \ From 20c5d668fe316d6f431a34f8734600194644e736 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 14:12:32 +0000 Subject: [PATCH 226/935] dgemm/avx512 simplify and speed up the 4x4 kernel --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 26 ++++------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index b5693ea2c..bb121ca69 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -333,17 +333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define KERNEL4x4_SUB() \ ymm0 = _mm256_loadu_pd(AO - 16); \ - ymm1 = _mm256_loadu_pd(BO - 12); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 12)); \ \ ymm4 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 11)); \ ymm5 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 10)); \ ymm6 += ymm0 * ymm1; \ \ - ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 9)); \ ymm7 += ymm0 * ymm1; \ AO += 4; \ BO += 4; @@ -356,24 +356,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ymm6 *= ymm0; \ ymm7 *= ymm0; \ \ - ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ - ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \ - \ - ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ - ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ - ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ - ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ - \ - ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ - ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ - ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ - ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ - \ - ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ - ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ - ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ - ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ - \ ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \ From 32bec8afbbdb94df4e5a4b127fa8aa5857fccc54 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 16:36:26 +0000 Subject: [PATCH 227/935] add a skylakex optimized dgemm beta function --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- kernel/x86_64/dgemm_beta_skylakex.c | 150 ++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_beta_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index e34cda770..48c81e80b 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,4 +10,4 @@ DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c SGEMM_BETA = ../generic/gemm_beta.c -DGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c new file mode 100644 index 000000000..384e9f60b --- /dev/null +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/
+/*                                                                   */
+/* The views and conclusions contained in the software and          */
+/* documentation are those of the authors and should not be         */
+/* interpreted as representing official policies, either expressed  */
+/* or implied, of The University of Texas at Austin.                */
+/*********************************************************************/
+
+#include "common.h"
+
+#include <immintrin.h>
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
+          FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
+          FLOAT *c, BLASLONG ldc){
+
+  BLASLONG i, j;
+  FLOAT *c_offset1, *c_offset;
+  FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
+  FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
+
+  /* fast path.. just zero the whole matrix */
+  if (m == ldc && beta == ZERO) {
+    memset(c, 0, m * n * sizeof(FLOAT));
+    return 0;
+  }
+
+
+  c_offset = c;
+
+  if (beta == ZERO){
+    __m512d z_zero;
+
+    z_zero = _mm512_setzero_pd();
+    j = n;
+    do {
+      c_offset1 = c_offset;
+      c_offset += ldc;
+
+      i = m;
+
+      while (i > 32) {
+        _mm512_storeu_pd(c_offset1, z_zero);
+        _mm512_storeu_pd(c_offset1 + 8, z_zero);
+        _mm512_storeu_pd(c_offset1 + 16, z_zero);
+        _mm512_storeu_pd(c_offset1 + 24, z_zero);
+        c_offset1 += 32;
+        i -= 32;
+      }
+      while (i > 8) {
+        _mm512_storeu_pd(c_offset1, z_zero);
+        c_offset1 += 8;
+        i -= 8;
+      }
+
+      while (i > 0) {
+        *c_offset1 = ZERO;
+        c_offset1 ++;
+        i --;
+      }
+      j --;
+    } while (j > 0);
+
+  } else {
+
+    j = n;
+    do {
+      c_offset1 = c_offset;
+      c_offset += ldc;
+
+      i = (m >> 3);
+      if (i > 0){
+        do {
+          ctemp1 = *(c_offset1 + 0);
+          ctemp2 = *(c_offset1 + 1);
+          ctemp3 = *(c_offset1 + 2);
+          ctemp4 = *(c_offset1 + 3);
+          ctemp5 = *(c_offset1 + 4);
+          ctemp6 = *(c_offset1 + 5);
+          ctemp7 = *(c_offset1 + 6);
+          ctemp8 = *(c_offset1 + 7);
+
+          ctemp1 *= beta;
+          ctemp2 *= beta;
+          ctemp3 *= beta;
+          ctemp4 *= beta;
+          ctemp5 *= beta;
+          ctemp6 *= beta;
+          ctemp7 *= beta;
+          ctemp8 *= beta;
+
+          *(c_offset1 + 0) = ctemp1;
+          *(c_offset1 + 1) = ctemp2;
+          *(c_offset1 + 2) = ctemp3;
+          *(c_offset1 + 3) = ctemp4;
+          *(c_offset1 + 4) = ctemp5;
+          *(c_offset1 + 5) = ctemp6;
+          *(c_offset1 + 6) = ctemp7;
+          *(c_offset1 + 7) = ctemp8;
+          c_offset1 += 8;
+          i --;
+        } while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i > 0){
+        do {
+          ctemp1 = *c_offset1;
+          ctemp1 *= beta;
+          *c_offset1 = ctemp1;
+          c_offset1 ++;
+          i --;
+        } while (i > 0);
+      }
+      j --;
+    } while (j > 0);
+
+  }
+  return 0;
+};

From adbf6afa25ca5383d48df296262bb4f2bfc0e311 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sat, 6 Oct 2018 21:18:12 +0000
Subject: [PATCH 228/935] Add vector optimizations for ncopy as well for dgemm/skylakex

---
 kernel/x86_64/dgemm_ncopy_8_skylakex.c | 201 ++++++++++++-------------
 1 file changed, 100 insertions(+), 101 deletions(-)

diff --git a/kernel/x86_64/dgemm_ncopy_8_skylakex.c b/kernel/x86_64/dgemm_ncopy_8_skylakex.c
index 3bc55b8cc..74b336f3d 100644
--- a/kernel/x86_64/dgemm_ncopy_8_skylakex.c
+++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c
@@ -38,6 +38,7 @@
 
 #include <stdio.h>
 #include "common.h"
+#include <immintrin.h>
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){
   BLASLONG i, j;
@@ -84,131 +85,129 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
       i = (m >> 3);
       if (i > 0){
         do{
-          ctemp01 = *(aoffset1 + 0);
-          ctemp02 = *(aoffset1 + 1);
-          ctemp03 = *(aoffset1 + 2);
-          ctemp04 = *(aoffset1 + 3);
-          ctemp05 = *(aoffset1 + 4);
-          ctemp06 = *(aoffset1 + 5);
+          __m128d xmm0, xmm1;
+          xmm0 = _mm_load_pd1(aoffset2 + 0);
+          xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 0);
+          _mm_storeu_pd(boffset + 0, xmm0);
+
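+          /* _mm_load_pd1 broadcasts one element of the higher-numbered
+             column, and _mm_loadl_pd then overwrites the low lane with
+             the matching element of the lower-numbered column, so each
+             two-element store writes one transposed pair directly;
+             the same idiom repeats for the remaining column pairs */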
ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - ctemp09 = *(aoffset2 + 0); - ctemp10 = *(aoffset2 + 1); - ctemp11 = *(aoffset2 + 2); - ctemp12 = *(aoffset2 + 3); - ctemp13 = *(aoffset2 + 4); - ctemp14 = *(aoffset2 + 5); + xmm1 = _mm_load_pd1(aoffset4 + 0); + xmm1 = _mm_loadl_pd(xmm1, aoffset3 + 0); + _mm_storeu_pd(boffset + 2, xmm1); + + xmm0 = _mm_load_pd1(aoffset6 + 0); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 0); + _mm_storeu_pd(boffset + 4, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 0); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 0); + _mm_storeu_pd(boffset + 6, xmm0); + ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - ctemp17 = *(aoffset3 + 0); - ctemp18 = *(aoffset3 + 1); - ctemp19 = *(aoffset3 + 2); - ctemp20 = *(aoffset3 + 3); - ctemp21 = *(aoffset3 + 4); - ctemp22 = *(aoffset3 + 5); + xmm0 = _mm_load_pd1(aoffset2 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 1); + _mm_storeu_pd(boffset + 8, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 1); + _mm_storeu_pd(boffset + 10, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 1); + _mm_storeu_pd(boffset + 12, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 1); + _mm_storeu_pd(boffset + 14, xmm0); + + xmm0 = _mm_load_pd1(aoffset2 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 2); + _mm_storeu_pd(boffset + 16, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 2); + _mm_storeu_pd(boffset + 18, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 2); + _mm_storeu_pd(boffset + 20, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 2); + _mm_storeu_pd(boffset + 22, xmm0); + ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); - ctemp25 = *(aoffset4 + 0); - ctemp26 = *(aoffset4 + 1); - ctemp27 = *(aoffset4 + 2); - ctemp28 = *(aoffset4 + 3); - ctemp29 = *(aoffset4 + 4); - ctemp30 = *(aoffset4 + 5); + xmm0 = _mm_load_pd1(aoffset2 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 3); + _mm_storeu_pd(boffset + 24, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 3); + _mm_storeu_pd(boffset + 26, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 3); + _mm_storeu_pd(boffset + 28, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 3); + _mm_storeu_pd(boffset + 30, xmm0); + ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); - ctemp33 = *(aoffset5 + 0); - ctemp34 = *(aoffset5 + 1); - ctemp35 = *(aoffset5 + 2); - ctemp36 = *(aoffset5 + 3); - ctemp37 = *(aoffset5 + 4); - ctemp38 = *(aoffset5 + 5); + + xmm0 = _mm_load_pd1(aoffset2 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 4); + _mm_storeu_pd(boffset + 32, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 4); + _mm_storeu_pd(boffset + 34, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 4); + _mm_storeu_pd(boffset + 36, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 4); + _mm_storeu_pd(boffset + 38, xmm0); + ctemp39 = *(aoffset5 + 6); ctemp40 = *(aoffset5 + 7); - ctemp41 = *(aoffset6 + 0); - ctemp42 = *(aoffset6 + 1); - ctemp43 = *(aoffset6 + 2); - ctemp44 = *(aoffset6 + 3); - ctemp45 = *(aoffset6 + 4); - ctemp46 = *(aoffset6 + 5); + xmm0 = _mm_load_pd1(aoffset2 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 5); + _mm_storeu_pd(boffset + 40, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 5); 
+ xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 5); + _mm_storeu_pd(boffset + 42, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 5); + _mm_storeu_pd(boffset + 44, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 5); + _mm_storeu_pd(boffset + 46, xmm0); + + ctemp47 = *(aoffset6 + 6); ctemp48 = *(aoffset6 + 7); - ctemp49 = *(aoffset7 + 0); - ctemp50 = *(aoffset7 + 1); - ctemp51 = *(aoffset7 + 2); - ctemp52 = *(aoffset7 + 3); - ctemp53 = *(aoffset7 + 4); - ctemp54 = *(aoffset7 + 5); ctemp55 = *(aoffset7 + 6); ctemp56 = *(aoffset7 + 7); - ctemp57 = *(aoffset8 + 0); - ctemp58 = *(aoffset8 + 1); - ctemp59 = *(aoffset8 + 2); - ctemp60 = *(aoffset8 + 3); - ctemp61 = *(aoffset8 + 4); - ctemp62 = *(aoffset8 + 5); ctemp63 = *(aoffset8 + 6); ctemp64 = *(aoffset8 + 7); - *(boffset + 0) = ctemp01; - *(boffset + 1) = ctemp09; - *(boffset + 2) = ctemp17; - *(boffset + 3) = ctemp25; - *(boffset + 4) = ctemp33; - *(boffset + 5) = ctemp41; - *(boffset + 6) = ctemp49; - *(boffset + 7) = ctemp57; - - *(boffset + 8) = ctemp02; - *(boffset + 9) = ctemp10; - *(boffset + 10) = ctemp18; - *(boffset + 11) = ctemp26; - *(boffset + 12) = ctemp34; - *(boffset + 13) = ctemp42; - *(boffset + 14) = ctemp50; - *(boffset + 15) = ctemp58; - - *(boffset + 16) = ctemp03; - *(boffset + 17) = ctemp11; - *(boffset + 18) = ctemp19; - *(boffset + 19) = ctemp27; - *(boffset + 20) = ctemp35; - *(boffset + 21) = ctemp43; - *(boffset + 22) = ctemp51; - *(boffset + 23) = ctemp59; - - *(boffset + 24) = ctemp04; - *(boffset + 25) = ctemp12; - *(boffset + 26) = ctemp20; - *(boffset + 27) = ctemp28; - *(boffset + 28) = ctemp36; - *(boffset + 29) = ctemp44; - *(boffset + 30) = ctemp52; - *(boffset + 31) = ctemp60; - - *(boffset + 32) = ctemp05; - *(boffset + 33) = ctemp13; - *(boffset + 34) = ctemp21; - *(boffset + 35) = ctemp29; - *(boffset + 36) = ctemp37; - *(boffset + 37) = ctemp45; - *(boffset + 38) = ctemp53; - *(boffset + 39) = ctemp61; - - *(boffset + 40) = ctemp06; - *(boffset + 41) = ctemp14; - *(boffset + 42) = ctemp22; - *(boffset + 43) = ctemp30; - *(boffset + 44) = ctemp38; - *(boffset + 45) = ctemp46; - *(boffset + 46) = ctemp54; - *(boffset + 47) = ctemp62; *(boffset + 48) = ctemp07; *(boffset + 49) = ctemp15; From 582c589727302938e99bf594bf072d3d9913575e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 6 Oct 2018 23:13:26 +0000 Subject: [PATCH 229/935] dgemm/skylakex: replace discrete mul/add with fma very minor gains since it's not super hot code, but general principles --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 155 +++++++--------------- 1 file changed, 49 insertions(+), 106 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index bb121ca69..a83ca98fa 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -927,39 +927,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "jg .label24\n" /* multiply the result by alpha */ "vbroadcastsd (%[alpha]), %%zmm9\n" - "vmulpd %%zmm9, %%zmm1, %%zmm1\n" - "vmulpd %%zmm9, %%zmm2, %%zmm2\n" - "vmulpd %%zmm9, %%zmm3, %%zmm3\n" - "vmulpd %%zmm9, %%zmm4, %%zmm4\n" - "vmulpd %%zmm9, %%zmm5, %%zmm5\n" - "vmulpd %%zmm9, %%zmm6, %%zmm6\n" - "vmulpd %%zmm9, %%zmm7, %%zmm7\n" - "vmulpd %%zmm9, %%zmm8, %%zmm8\n" - "vmulpd %%zmm9, %%zmm11, %%zmm11\n" - "vmulpd %%zmm9, %%zmm12, %%zmm12\n" - "vmulpd %%zmm9, %%zmm13, %%zmm13\n" - "vmulpd %%zmm9, %%zmm14, %%zmm14\n" - "vmulpd 
%%zmm9, %%zmm15, %%zmm15\n" - "vmulpd %%zmm9, %%zmm16, %%zmm16\n" - "vmulpd %%zmm9, %%zmm17, %%zmm17\n" - "vmulpd %%zmm9, %%zmm18, %%zmm18\n" - "vmulpd %%zmm9, %%zmm21, %%zmm21\n" - "vmulpd %%zmm9, %%zmm22, %%zmm22\n" - "vmulpd %%zmm9, %%zmm23, %%zmm23\n" - "vmulpd %%zmm9, %%zmm24, %%zmm24\n" - "vmulpd %%zmm9, %%zmm25, %%zmm25\n" - "vmulpd %%zmm9, %%zmm26, %%zmm26\n" - "vmulpd %%zmm9, %%zmm27, %%zmm27\n" - "vmulpd %%zmm9, %%zmm28, %%zmm28\n" /* And store additively in C */ - "vaddpd (%[C0]), %%zmm1, %%zmm1\n" - "vaddpd (%[C1]), %%zmm2, %%zmm2\n" - "vaddpd (%[C2]), %%zmm3, %%zmm3\n" - "vaddpd (%[C3]), %%zmm4, %%zmm4\n" - "vaddpd (%[C4]), %%zmm5, %%zmm5\n" - "vaddpd (%[C5]), %%zmm6, %%zmm6\n" - "vaddpd (%[C6]), %%zmm7, %%zmm7\n" - "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" "vmovupd %%zmm1, (%[C0])\n" "vmovupd %%zmm2, (%[C1])\n" "vmovupd %%zmm3, (%[C2])\n" @@ -969,14 +945,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovupd %%zmm7, (%[C6])\n" "vmovupd %%zmm8, (%[C7])\n" - "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" - "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" - "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" - "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" - "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" - "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" - "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" - "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n" + "vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n" + "vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n" + "vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n" + "vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n" + "vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n" + "vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n" + "vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n" "vmovupd %%zmm11, 64(%[C0])\n" "vmovupd %%zmm12, 64(%[C1])\n" "vmovupd %%zmm13, 64(%[C2])\n" @@ -986,14 +962,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovupd %%zmm17, 64(%[C6])\n" "vmovupd %%zmm18, 64(%[C7])\n" - "vaddpd 128(%[C0]), %%zmm21, %%zmm21\n" - "vaddpd 128(%[C1]), %%zmm22, %%zmm22\n" - "vaddpd 128(%[C2]), %%zmm23, %%zmm23\n" - "vaddpd 128(%[C3]), %%zmm24, %%zmm24\n" - "vaddpd 128(%[C4]), %%zmm25, %%zmm25\n" - "vaddpd 128(%[C5]), %%zmm26, %%zmm26\n" - "vaddpd 128(%[C6]), %%zmm27, %%zmm27\n" - "vaddpd 128(%[C7]), %%zmm28, %%zmm28\n" + "vfmadd213pd 128(%[C0]), %%zmm9, %%zmm21\n" + "vfmadd213pd 128(%[C1]), %%zmm9, %%zmm22\n" + "vfmadd213pd 128(%[C2]), %%zmm9, %%zmm23\n" + "vfmadd213pd 128(%[C3]), %%zmm9, %%zmm24\n" + "vfmadd213pd 128(%[C4]), %%zmm9, %%zmm25\n" + "vfmadd213pd 128(%[C5]), %%zmm9, %%zmm26\n" + "vfmadd213pd 128(%[C6]), %%zmm9, %%zmm27\n" + "vfmadd213pd 128(%[C7]), %%zmm9, %%zmm28\n" "vmovupd %%zmm21, 128(%[C0])\n" "vmovupd %%zmm22, 128(%[C1])\n" "vmovupd %%zmm23, 128(%[C2])\n" @@ -1108,31 +1084,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "jg .label16\n" /* multiply the result by alpha */ "vbroadcastsd (%[alpha]), %%zmm9\n" - "vmulpd %%zmm9, %%zmm1, %%zmm1\n" - "vmulpd %%zmm9, %%zmm2, %%zmm2\n" - "vmulpd %%zmm9, %%zmm3, %%zmm3\n" - "vmulpd %%zmm9, %%zmm4, %%zmm4\n" - "vmulpd %%zmm9, %%zmm5, %%zmm5\n" - "vmulpd %%zmm9, %%zmm6, %%zmm6\n" - "vmulpd %%zmm9, %%zmm7, %%zmm7\n" - "vmulpd 
%%zmm9, %%zmm8, %%zmm8\n" - "vmulpd %%zmm9, %%zmm11, %%zmm11\n" - "vmulpd %%zmm9, %%zmm12, %%zmm12\n" - "vmulpd %%zmm9, %%zmm13, %%zmm13\n" - "vmulpd %%zmm9, %%zmm14, %%zmm14\n" - "vmulpd %%zmm9, %%zmm15, %%zmm15\n" - "vmulpd %%zmm9, %%zmm16, %%zmm16\n" - "vmulpd %%zmm9, %%zmm17, %%zmm17\n" - "vmulpd %%zmm9, %%zmm18, %%zmm18\n" /* And store additively in C */ - "vaddpd (%[C0]), %%zmm1, %%zmm1\n" - "vaddpd (%[C1]), %%zmm2, %%zmm2\n" - "vaddpd (%[C2]), %%zmm3, %%zmm3\n" - "vaddpd (%[C3]), %%zmm4, %%zmm4\n" - "vaddpd (%[C4]), %%zmm5, %%zmm5\n" - "vaddpd (%[C5]), %%zmm6, %%zmm6\n" - "vaddpd (%[C6]), %%zmm7, %%zmm7\n" - "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" "vmovupd %%zmm1, (%[C0])\n" "vmovupd %%zmm2, (%[C1])\n" "vmovupd %%zmm3, (%[C2])\n" @@ -1142,14 +1102,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovupd %%zmm7, (%[C6])\n" "vmovupd %%zmm8, (%[C7])\n" - "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" - "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" - "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" - "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" - "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" - "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" - "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" - "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n" + "vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n" + "vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n" + "vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n" + "vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n" + "vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n" + "vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n" + "vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n" "vmovupd %%zmm11, 64(%[C0])\n" "vmovupd %%zmm12, 64(%[C1])\n" "vmovupd %%zmm13, 64(%[C2])\n" @@ -1221,24 +1181,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "add $64, %[BO]\n" "subl $1, %[kloop]\n" "jg .label1\n" - /* multiply the result by alpha */ - "vmulpd %%zmm9, %%zmm1, %%zmm1\n" - "vmulpd %%zmm9, %%zmm2, %%zmm2\n" - "vmulpd %%zmm9, %%zmm3, %%zmm3\n" - "vmulpd %%zmm9, %%zmm4, %%zmm4\n" - "vmulpd %%zmm9, %%zmm5, %%zmm5\n" - "vmulpd %%zmm9, %%zmm6, %%zmm6\n" - "vmulpd %%zmm9, %%zmm7, %%zmm7\n" - "vmulpd %%zmm9, %%zmm8, %%zmm8\n" - /* And store additively in C */ - "vaddpd (%[C0]), %%zmm1, %%zmm1\n" - "vaddpd (%[C1]), %%zmm2, %%zmm2\n" - "vaddpd (%[C2]), %%zmm3, %%zmm3\n" - "vaddpd (%[C3]), %%zmm4, %%zmm4\n" - "vaddpd (%[C4]), %%zmm5, %%zmm5\n" - "vaddpd (%[C5]), %%zmm6, %%zmm6\n" - "vaddpd (%[C6]), %%zmm7, %%zmm7\n" - "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + /* multiply the result by alpha and add to the memory */ + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" "vmovupd %%zmm1, (%[C0])\n" "vmovupd %%zmm2, (%[C1])\n" "vmovupd %%zmm3, (%[C2])\n" @@ -1247,14 +1198,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovupd %%zmm6, (%[C5])\n" "vmovupd %%zmm7, (%[C6])\n" "vmovupd %%zmm8, (%[C7])\n" - "prefetchw 64(%[C0])\n" - "prefetchw 64(%[C1])\n" 
- "prefetchw 64(%[C2])\n" - "prefetchw 64(%[C3])\n" - "prefetchw 64(%[C4])\n" - "prefetchw 64(%[C5])\n" - "prefetchw 64(%[C6])\n" - "prefetchw 64(%[C7])\n" : [AO] "+r" (AO), [BO] "+r" (BO), From eba394c711440ab515f80ea01bd4e72342e4719b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Oct 2018 19:18:12 +0200 Subject: [PATCH 230/935] Add -march=skylake-avx512 when required fixes #1797 --- cmake/system_check.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index d339a755f..4ec4df416 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -71,6 +71,8 @@ if (X86_64 OR X86) execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +else() +set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif() file(REMOVE "avx512.tmp" "avx512.o") endif() From 697dc1baf8fe8f4c8ac0ee8a1f82ee7bad7395e5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Oct 2018 22:26:59 +0200 Subject: [PATCH 231/935] Use override for ARCH in make.inc in case a conflicting setting of ARCH (for architecture) gets pulled in from the environment (originally suggested by dloghin in #1753) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b947c1198..8ac77c729 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc From d3d58f8ee538f240b14abb4a9e9beffb8a495415 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Oct 2018 22:29:35 +0200 Subject: [PATCH 232/935] Catch conflicting usage of ARCH in at least some BSD environments fixes #1796 --- Makefile.system | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.system b/Makefile.system index 4712d9525..53537eb09 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,11 @@ ifndef TOPDIR TOPDIR = . 
endif +# Catch conflicting usage of ARCH in some BSD environments +ifeq ($(ARCH), amd64) +override ARCH=x86_64 +endif + NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # Default C compiler From d4c8853a029175d6064a09341201f776c32440b3 Mon Sep 17 00:00:00 2001 From: fengrl <42458138+fengrl@users.noreply.github.com> Date: Tue, 9 Oct 2018 11:20:16 +0800 Subject: [PATCH 233/935] Update common_mips64.h --- common_mips64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_mips64.h b/common_mips64.h index 93bc7e519..1163413dc 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){ #define RPCC_DEFINED #ifndef NO_AFFINITY -#define WHEREAMI +//#define WHEREAMI static inline int WhereAmI(void){ int ret=0; __asm__ __volatile__(".set push \n" From 6234a326569041cc2f3fa667c6f70402c056237f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 9 Oct 2018 10:31:59 +0200 Subject: [PATCH 234/935] Use cygwin compilation workaround for avx512 on msys2/mingw64 as well --- Makefile.x86_64 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f831b5040..f2647fb7d 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -15,6 +15,11 @@ FCOMMON_OPT += -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif endif endif From d4bad73834a9e1abf23e3c0a8f4e9a84e9137881 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 10 Oct 2018 01:49:22 +0000 Subject: [PATCH 235/935] Add a C+intrinsics version of the SGEMM/skylakex kernel for most sizes this is 1.2x to 1.4x faster than the current code --- kernel/x86_64/sgemm_beta_skylakex.c | 150 ++ kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 1726 ++++++++++++++++++++ kernel/x86_64/sgemm_ncopy_4_skylakex.c | 207 +++ kernel/x86_64/sgemm_tcopy_16_skylakex.c | 387 +++++ 4 files changed, 2470 insertions(+) create mode 100644 kernel/x86_64/sgemm_beta_skylakex.c create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex.c create mode 100644 kernel/x86_64/sgemm_ncopy_4_skylakex.c create mode 100644 kernel/x86_64/sgemm_tcopy_16_skylakex.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c new file mode 100644 index 000000000..b1bf4d77a --- /dev/null +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT     */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,       */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES         */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE        */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR             */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF       */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT        */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT       */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE              */
+/* POSSIBILITY OF SUCH DAMAGE.                                      */
+/*                                                                   */
+/* The views and conclusions contained in the software and          */
+/* documentation are those of the authors and should not be         */
+/* interpreted as representing official policies, either expressed  */
+/* or implied, of The University of Texas at Austin.                */
+/*********************************************************************/
+
+#include "common.h"
+
+#include <immintrin.h>
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
+          FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
+          FLOAT *c, BLASLONG ldc){
+
+  BLASLONG i, j;
+  FLOAT *c_offset1, *c_offset;
+  FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
+  FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
+
+  /* fast path.. just zero the whole matrix */
+  if (m == ldc && beta == ZERO) {
+    memset(c, 0, m * n * sizeof(FLOAT));
+    return 0;
+  }
+
+
+  c_offset = c;
+
+  if (beta == ZERO){
+    __m512 z_zero;
+
+    z_zero = _mm512_setzero_ps();
+    j = n;
+    do {
+      c_offset1 = c_offset;
+      c_offset += ldc;
+
+      i = m;
+
+      while (i > 32) {
+        _mm512_storeu_ps(c_offset1, z_zero);
+        _mm512_storeu_ps(c_offset1 + 16, z_zero);
+        c_offset1 += 32;
+        i -= 32;
+      }
+      while (i > 16) {
+        _mm512_storeu_ps(c_offset1, z_zero);
+        c_offset1 += 16;
+        i -= 16;
+      }
+
+      while (i > 0) {
+        *c_offset1 = ZERO;
+        c_offset1 ++;
+        i --;
+      }
+      j --;
+    } while (j > 0);
+
+  } else {
+
+    j = n;
+    do {
+      c_offset1 = c_offset;
+      c_offset += ldc;
+
+      i = (m >> 3);
+      if (i > 0){
+        do {
+          ctemp1 = *(c_offset1 + 0);
+          ctemp2 = *(c_offset1 + 1);
+          ctemp3 = *(c_offset1 + 2);
+          ctemp4 = *(c_offset1 + 3);
+          ctemp5 = *(c_offset1 + 4);
+          ctemp6 = *(c_offset1 + 5);
+          ctemp7 = *(c_offset1 + 6);
+          ctemp8 = *(c_offset1 + 7);
+
+          ctemp1 *= beta;
+          ctemp2 *= beta;
+          ctemp3 *= beta;
+          ctemp4 *= beta;
+          ctemp5 *= beta;
+          ctemp6 *= beta;
+          ctemp7 *= beta;
+          ctemp8 *= beta;
+
+          *(c_offset1 + 0) = ctemp1;
+          *(c_offset1 + 1) = ctemp2;
+          *(c_offset1 + 2) = ctemp3;
+          *(c_offset1 + 3) = ctemp4;
+          *(c_offset1 + 4) = ctemp5;
+          *(c_offset1 + 5) = ctemp6;
+          *(c_offset1 + 6) = ctemp7;
+          *(c_offset1 + 7) = ctemp8;
+          c_offset1 += 8;
+          i --;
+        } while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i > 0){
+        do {
+          ctemp1 = *c_offset1;
+          ctemp1 *= beta;
+          *c_offset1 = ctemp1;
+          c_offset1 ++;
+          i --;
+        } while (i > 0);
+      }
+      j --;
+    } while (j > 0);
+
+  }
+  return 0;
+};

diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
new file mode 100644
index 000000000..b2b1ab03f
--- /dev/null
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
@@ -0,0 +1,1726 @@
+/*********************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1.
Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+
+/* comment below left for history, data does not represent the implementation in this file */
+
+/*********************************************************************
+* 2014/07/28 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+* 2013/10/28 Saar
+* Parameter:
+* SGEMM_DEFAULT_UNROLL_N 4
+* SGEMM_DEFAULT_UNROLL_M 16
+* SGEMM_DEFAULT_P 768
+* SGEMM_DEFAULT_Q 384
+* A_PR1 512
+* B_PR1 512
+*
+*
+* 2014/07/28 Saar
+* Performance at 9216x9216x9216:
+* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83)
+* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155)
+* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230)
+* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267)
+*
+*********************************************************************/

+#include "common.h"
+#include <immintrin.h>
+
+
+
+/*******************************************************************************************
+* 8 lines of N
+*******************************************************************************************/
+
+
+
+#define INIT32x8() \
+    row0 = _mm512_setzero_ps(); \
+    row1 = _mm512_setzero_ps(); \
+    row2 = _mm512_setzero_ps(); \
+    row3 = _mm512_setzero_ps(); \
+    row4 = _mm512_setzero_ps(); \
+    row5 = _mm512_setzero_ps(); \
+    row6 = _mm512_setzero_ps(); \
+    row7 = _mm512_setzero_ps(); \
+    row0b = _mm512_setzero_ps(); \
+    row1b = _mm512_setzero_ps(); \
+    row2b = _mm512_setzero_ps(); \
+    row3b = _mm512_setzero_ps(); \
+    row4b = _mm512_setzero_ps(); \
+    row5b = _mm512_setzero_ps(); \
+    row6b = _mm512_setzero_ps(); \
+    row7b = _mm512_setzero_ps(); \
+
+#define KERNEL32x8_SUB() \
+    zmm0 = _mm512_loadu_ps(AO); \
+    zmm0b = _mm512_loadu_ps(AOb); \
+    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
+    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
+    row0 += zmm0 * zmm2; \
+    row1 += zmm0 * zmm3; \
+    row0b += zmm0b * zmm2; \
+    row1b += zmm0b * zmm3; \
+    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
+    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
+    row2 += zmm0 * zmm2; \
+    row3 += zmm0 * zmm3; \
+    row2b += zmm0b * zmm2; \
+    row3b += zmm0b * zmm3; \
+    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
+    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
+    row4 += zmm0 * zmm2; \
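+    /* zmm2/zmm3 now hold the broadcasts of BO[4] and BO[5]; the same  \
+       broadcast-and-accumulate pattern continues through BO[7] */     \
+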
row5 += zmm0 * zmm3; \ + row4b += zmm0b * zmm2; \ + row5b += zmm0b * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += zmm0 * zmm2; \ + row7 += zmm0 * zmm3; \ + row6b += zmm0b * zmm2; \ + row7b += zmm0b * zmm3; \ + BO += 8; \ + AO += 16; \ + AOb += 16; + + +#define SAVE32x8(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row4 *= zmm0; \ + row5 *= zmm0; \ + row6 *= zmm0; \ + row7 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row4b *= zmm0; \ + row5b *= zmm0; \ + row6b *= zmm0; \ + row7b *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm512_storeu_ps(CO1 + 7 * ldc, row7); \ + row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \ + row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \ + row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \ + row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \ + row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \ + _mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \ + _mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \ + _mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \ + _mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \ + _mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \ + + +#define INIT16x8() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row4 = _mm512_setzero_ps(); \ + row5 = _mm512_setzero_ps(); \ + row6 = _mm512_setzero_ps(); \ + row7 = _mm512_setzero_ps(); \ + +#define KERNEL16x8_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += zmm0 * zmm2; \ + row5 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += zmm0 * zmm2; \ + row7 += zmm0 * zmm3; \ + BO += 8; \ + AO += 16; + + +#define SAVE16x8(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row4 *= zmm0; \ + row5 *= zmm0; \ + row6 *= zmm0; \ + row7 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += 
_mm512_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm512_storeu_ps(CO1 + 7 * ldc, row7); + + + +/*******************************************************************************************/ + +#define INIT8x8() \ + row0 = _mm256_setzero_ps(); \ + row1 = _mm256_setzero_ps(); \ + row2 = _mm256_setzero_ps(); \ + row3 = _mm256_setzero_ps(); \ + row4 = _mm256_setzero_ps(); \ + row5 = _mm256_setzero_ps(); \ + row6 = _mm256_setzero_ps(); \ + row7 = _mm256_setzero_ps(); \ + +#define KERNEL8x8_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += ymm0 * ymm2; \ + row1 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += ymm0 * ymm2; \ + row3 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += ymm0 * ymm2; \ + row5 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += ymm0 * ymm2; \ + row7 += ymm0 * ymm3; \ + BO += 8; \ + AO += 8; + + +#define SAVE8x8(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + row0 *= ymm0; \ + row1 *= ymm0; \ + row2 *= ymm0; \ + row3 *= ymm0; \ + row4 *= ymm0; \ + row5 *= ymm0; \ + row6 *= ymm0; \ + row7 *= ymm0; \ + row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \ + _mm256_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm256_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm256_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm256_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm256_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm256_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm256_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm256_storeu_ps(CO1 + 7 * ldc, row7); \ + + + +/*******************************************************************************************/ + +#define INIT4x8() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + row2 = _mm_setzero_ps(); \ + row3 = _mm_setzero_ps(); \ + row4 = _mm_setzero_ps(); \ + row5 = _mm_setzero_ps(); \ + row6 = _mm_setzero_ps(); \ + row7 = _mm_setzero_ps(); \ + + +#define KERNEL4x8_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += xmm0 * xmm2; \ + row5 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += 
xmm0 * xmm2; \ + row7 += xmm0 * xmm3; \ + BO += 8; \ + AO += 4; + + +#define SAVE4x8(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + row4 *= xmm0; \ + row5 *= xmm0; \ + row6 *= xmm0; \ + row7 *= xmm0; \ + row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm_loadu_ps(CO1 + 7 * ldc); \ + _mm_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm_storeu_ps(CO1 + 7 * ldc, row7); \ + + +/*******************************************************************************************/ + +#define INIT2x8() \ + row0a = row0b = 0; \ + row1a = row1b = 0; \ + row2a = row2b = 0; \ + row3a = row3b = 0; \ + row4a = row4b = 0; \ + row5a = row5b = 0; \ + row6a = row6b = 0; \ + row7a = row7b = 0; \ + +#define KERNEL2x8_SUB() \ + xmm0 = *(AO); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0a += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1a += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2a += xmm0 * xmm2; \ + row2b += xmm1 * xmm2; \ + row3a += xmm0 * xmm3; \ + row3b += xmm1 * xmm3; \ + xmm2 = *(BO + 4); \ + xmm3 = *(BO + 5); \ + row4a += xmm0 * xmm2; \ + row4b += xmm1 * xmm2; \ + row5a += xmm0 * xmm3; \ + row5b += xmm1 * xmm3; \ + xmm2 = *(BO + 6); \ + xmm3 = *(BO + 7); \ + row6a += xmm0 * xmm2; \ + row6b += xmm1 * xmm2; \ + row7a += xmm0 * xmm3; \ + row7b += xmm1 * xmm3; \ + BO += 8; \ + AO += 2; + + +#define SAVE2x8(ALPHA) \ + xmm0 = ALPHA; \ + row0a *= xmm0; \ + row0b *= xmm0; \ + row1a *= xmm0; \ + row1b *= xmm0; \ + row2a *= xmm0; \ + row2b *= xmm0; \ + row3a *= xmm0; \ + row3b *= xmm0; \ + row4a *= xmm0; \ + row4b *= xmm0; \ + row5a *= xmm0; \ + row5b *= xmm0; \ + row6a *= xmm0; \ + row6b *= xmm0; \ + row7a *= xmm0; \ + row7b *= xmm0; \ + *(CO1 + 0 * ldc + 0) += row0a; \ + *(CO1 + 0 * ldc + 1) += row0b; \ + *(CO1 + 1 * ldc + 0) += row1a; \ + *(CO1 + 1 * ldc + 1) += row1b; \ + *(CO1 + 2 * ldc + 0) += row2a; \ + *(CO1 + 2 * ldc + 1) += row2b; \ + *(CO1 + 3 * ldc + 0) += row3a; \ + *(CO1 + 3 * ldc + 1) += row3b; \ + *(CO1 + 4 * ldc + 0) += row4a; \ + *(CO1 + 4 * ldc + 1) += row4b; \ + *(CO1 + 5 * ldc + 0) += row5a; \ + *(CO1 + 5 * ldc + 1) += row5b; \ + *(CO1 + 6 * ldc + 0) += row6a; \ + *(CO1 + 6 * ldc + 1) += row6b; \ + *(CO1 + 7 * ldc + 0) += row7a; \ + *(CO1 + 7 * ldc + 1) += row7b; \ + + + +/*******************************************************************************************/ + +#define INIT1x8() \ + row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0; + +#define KERNEL1x8_SUB() \ + xmm0 = *(AO ); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + xmm2 = *(BO + 4); \ + xmm3 = *(BO + 5); \ + row4 += xmm0 * xmm2; \ + row5 += xmm0 * xmm3; \ + xmm2 = *(BO + 6); \ + xmm3 = *(BO + 7); \ + row6 += xmm0 * xmm2; \ + row7 += xmm0 * xmm3; \ + BO += 8; \ + AO += 1; + + +#define SAVE1x8(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ 
+ row3 *= xmm0; \ + row4 *= xmm0; \ + row5 *= xmm0; \ + row6 *= xmm0; \ + row7 *= xmm0; \ + *(CO1 + 0 * ldc) += row0; \ + *(CO1 + 1 * ldc) += row1; \ + *(CO1 + 2 * ldc) += row2; \ + *(CO1 + 3 * ldc) += row3; \ + *(CO1 + 4 * ldc) += row4; \ + *(CO1 + 5 * ldc) += row5; \ + *(CO1 + 6 * ldc) += row6; \ + *(CO1 + 7 * ldc) += row7; \ + + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +#define INIT64x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row0c = _mm512_setzero_ps(); \ + row1c = _mm512_setzero_ps(); \ + row2c = _mm512_setzero_ps(); \ + row3c = _mm512_setzero_ps(); \ + row0d = _mm512_setzero_ps(); \ + row1d = _mm512_setzero_ps(); \ + row2d = _mm512_setzero_ps(); \ + row3d = _mm512_setzero_ps(); \ + +#define KERNEL64x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm5 = _mm512_loadu_ps(A2); \ + zmm7 = _mm512_loadu_ps(A3); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + row0c += zmm5 * zmm2; \ + row1c += zmm5 * zmm3; \ + row0d += zmm7 * zmm2; \ + row1d += zmm7 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + row2c += zmm5 * zmm2; \ + row3c += zmm5 * zmm3; \ + row2d += zmm7 * zmm2; \ + row3d += zmm7 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; \ + A2 += 16; \ + A3 += 16; \ + + +#define SAVE64x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0c *= zmm0; \ + row1c *= zmm0; \ + row2c *= zmm0; \ + row3c *= zmm0; \ + row0d *= zmm0; \ + row1d *= zmm0; \ + row2d *= zmm0; \ + row3d *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \ + row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \ + row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \ + row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \ + row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \ + _mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \ + _mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \ + _mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \ + _mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); \ + row0d += _mm512_loadu_ps(CO1 + 0*ldc + 48); \ + row1d += _mm512_loadu_ps(CO1 + 1*ldc + 48); \ + row2d += _mm512_loadu_ps(CO1 + 2*ldc + 48); \ + row3d += 
_mm512_loadu_ps(CO1 + 3*ldc + 48); \ + _mm512_storeu_ps(CO1 + 0*ldc + 48, row0d); \ + _mm512_storeu_ps(CO1 + 1*ldc + 48, row1d); \ + _mm512_storeu_ps(CO1 + 2*ldc + 48, row2d); \ + _mm512_storeu_ps(CO1 + 3*ldc + 48, row3d); + + +#define INIT48x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row0c = _mm512_setzero_ps(); \ + row1c = _mm512_setzero_ps(); \ + row2c = _mm512_setzero_ps(); \ + row3c = _mm512_setzero_ps(); \ + +#define KERNEL48x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm5 = _mm512_loadu_ps(A2); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + row0c += zmm5 * zmm2; \ + row1c += zmm5 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + row2c += zmm5 * zmm2; \ + row3c += zmm5 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; \ + A2 += 16; + + +#define SAVE48x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0c *= zmm0; \ + row1c *= zmm0; \ + row2c *= zmm0; \ + row3c *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \ + row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \ + row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \ + row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \ + row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \ + _mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \ + _mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \ + _mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \ + _mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); + + +#define INIT32x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + +#define KERNEL32x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; + + 
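+/* Editorial sketch (an assumption, not part of the original commit): every
+ * INITMxN / KERNELMxN_SUB / SAVEMxN macro triple in this file follows one
+ * scheme: clear an MxN tile of accumulators, apply one rank-1 update
+ * A(:,k) * B(k,:) per k step, then scale by alpha and add into C. A plain-C
+ * reference for a generic tile, with the hypothetical name sgemm_tile_ref,
+ * is kept under #if 0 so it cannot affect the build: */
+#if 0
+static void sgemm_tile_ref(int M, int N, int K, float alpha,
+                           const float *A, const float *B,
+                           float *C, int ldc)
+{
+    /* assumes M * N <= 64 * 8, the largest tile used in this file */
+    float acc[64 * 8] = {0};                     /* INITMxN()            */
+    for (int kk = 0; kk < K; kk++)               /* KERNELMxN_SUB() loop */
+        for (int j = 0; j < N; j++)              /* A is packed M floats */
+            for (int i = 0; i < M; i++)          /* per k, B is N per k  */
+                acc[j * M + i] += A[kk * M + i] * B[kk * N + j];
+    for (int j = 0; j < N; j++)                  /* SAVEMxN(alpha)       */
+        for (int i = 0; i < M; i++)
+            C[j * ldc + i] += alpha * acc[j * M + i];
+}
+#endif
+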
+#define SAVE32x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); + + + +#define INIT16x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + +#define KERNEL16x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + BO += 4; \ + AO += 16; + + +#define SAVE16x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); + + + +/*******************************************************************************************/ + +#define INIT8x4() \ + ymm4 = _mm256_setzero_ps(); \ + ymm6 = _mm256_setzero_ps(); \ + ymm8 = _mm256_setzero_ps(); \ + ymm10 = _mm256_setzero_ps(); \ + +#define KERNEL8x4_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + ymm4 += ymm0 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ + ymm8 += ymm0 * ymm2; \ + ymm10 += ymm0 * ymm3; \ + BO += 4; \ + AO += 8; + + +#define SAVE8x4(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm6 *= ymm0; \ + ymm8 *= ymm0; \ + ymm10 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1 + 0 * ldc); \ + ymm6 += _mm256_loadu_ps(CO1 + 1 * ldc); \ + ymm8 += _mm256_loadu_ps(CO1 + 2 * ldc); \ + ymm10 += _mm256_loadu_ps(CO1 + 3 * ldc); \ + _mm256_storeu_ps(CO1 + 0 * ldc, ymm4); \ + _mm256_storeu_ps(CO1 + 1 * ldc, ymm6); \ + _mm256_storeu_ps(CO1 + 2 * ldc, ymm8); \ + _mm256_storeu_ps(CO1 + 3 * ldc, ymm10); \ + + + +/*******************************************************************************************/ + +#define INIT4x4() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + row2 = _mm_setzero_ps(); \ + row3 = _mm_setzero_ps(); \ + + +#define KERNEL4x4_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ 
+ xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + BO += 4; \ + AO += 4; + + +#define SAVE4x4(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ + _mm_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm_storeu_ps(CO1 + 3 * ldc, row3); \ + + +/*******************************************************************************************/ + +#define INIT2x4() \ + row0 = 0; row0b = 0; row1 = 0; row1b = 0; \ + row2 = 0; row2b = 0; row3 = 0; row3b = 0; + +#define KERNEL2x4_SUB() \ + xmm0 = *(AO); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1 += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row2b += xmm1 * xmm2; \ + row3 += xmm0 * xmm3; \ + row3b += xmm1 * xmm3; \ + BO += 4; \ + AO += 2; + + +#define SAVE2x4(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + row1 *= xmm0; \ + row1b *= xmm0; \ + row2 *= xmm0; \ + row2b *= xmm0; \ + row3 *= xmm0; \ + row3b *= xmm0; \ + *(CO1 + 0 * ldc + 0) += row0; \ + *(CO1 + 0 * ldc + 1) += row0b; \ + *(CO1 + 1 * ldc + 0) += row1; \ + *(CO1 + 1 * ldc + 1) += row1b; \ + *(CO1 + 2 * ldc + 0) += row2; \ + *(CO1 + 2 * ldc + 1) += row2b; \ + *(CO1 + 3 * ldc + 0) += row3; \ + *(CO1 + 3 * ldc + 1) += row3b; \ + + + +/*******************************************************************************************/ + +#define INIT1x4() \ + row0 = 0; row1 = 0; row2 = 0; row3 = 0; +#define KERNEL1x4_SUB() \ + xmm0 = *(AO ); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + BO += 4; \ + AO += 1; + + +#define SAVE1x4(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + *(CO1 + 0 * ldc) += row0; \ + *(CO1 + 1 * ldc) += row1; \ + *(CO1 + 2 * ldc) += row2; \ + *(CO1 + 3 * ldc) += row3; \ + + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define INIT16x2() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + + +#define KERNEL16x2_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + BO += 2; \ + AO += 16; + + +#define SAVE16x2(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1); \ + row1 += _mm512_loadu_ps(CO1 + ldc); \ + _mm512_storeu_ps(CO1 , row0); \ + _mm512_storeu_ps(CO1 + ldc, row1); \ + + + + +/*******************************************************************************************/ + +#define INIT8x2() \ + ymm4 = _mm256_setzero_ps(); \ + ymm6 = _mm256_setzero_ps(); \ + +#define KERNEL8x2_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = 
_mm256_broadcastss_ps(_mm_load_ss(BO)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + ymm4 += ymm0 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + BO += 2; \ + AO += 8; + + +#define SAVE8x2(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm6 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1); \ + ymm6 += _mm256_loadu_ps(CO1 + ldc); \ + _mm256_storeu_ps(CO1 , ymm4); \ + _mm256_storeu_ps(CO1 + ldc, ymm6); \ + + + +/*******************************************************************************************/ + +#define INIT4x2() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + +#define KERNEL4x2_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + BO += 2; \ + AO += 4; + + +#define SAVE4x2(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row0 += _mm_loadu_ps(CO1); \ + row1 += _mm_loadu_ps(CO1 + ldc); \ + _mm_storeu_ps(CO1 , row0); \ + _mm_storeu_ps(CO1 + ldc, row1); \ + + + +/*******************************************************************************************/ + + +#define INIT2x2() \ + row0 = 0; row0b = 0; row1 = 0; row1b = 0; \ + +#define KERNEL2x2_SUB() \ + xmm0 = *(AO + 0); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1 += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + BO += 2; \ + AO += 2; \ + + +#define SAVE2x2(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + row1 *= xmm0; \ + row1b *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 +1 ) += row0b; \ + *(CO1 + ldc ) += row1; \ + *(CO1 + ldc +1) += row1b; \ + + +/*******************************************************************************************/ + +#define INIT1x2() \ + row0 = 0; row1 = 0; + +#define KERNEL1x2_SUB() \ + xmm0 = *(AO); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + BO += 2; \ + AO += 1; + + +#define SAVE1x2(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 + ldc ) += row1; \ + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define INIT16x1() \ + row0 = _mm512_setzero_ps(); \ + +#define KERNEL16x1_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + row0 += zmm0 * zmm2; \ + BO += 1; \ + AO += 16; + + +#define SAVE16x1(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1); \ + _mm512_storeu_ps(CO1 , row0); \ + + +/*******************************************************************************************/ + +#define INIT8x1() \ + ymm4 = _mm256_setzero_ps(); + +#define KERNEL8x1_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO)); \ + ymm4 += ymm0 * ymm2; \ + BO += 1; \ + AO += 8; + + +#define SAVE8x1(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1); \ + _mm256_storeu_ps(CO1 , ymm4); \ + + +/*******************************************************************************************/ + +#define INIT4x1() \ + row0 = _mm_setzero_ps(); \ + +#define KERNEL4x1_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \ + row0 += 
xmm0 * xmm2; \ + BO += 1; \ + AO += 4; + + +#define SAVE4x1(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row0 += _mm_loadu_ps(CO1); \ + _mm_storeu_ps(CO1 , row0); \ + + + +/*******************************************************************************************/ + +#define INIT2x1() \ + row0 = 0; row0b = 0; + +#define KERNEL2x1_SUB() \ + xmm0 = *(AO + 0); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + BO += 1; \ + AO += 2; + + +#define SAVE2x1(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 +1 ) += row0b; \ + + +/*******************************************************************************************/ + +#define INIT1x1() \ + row0 = 0; + +#define KERNEL1x1_SUB() \ + xmm0 = *(AO); \ + xmm2 = *(BO); \ + row0 += xmm0 * xmm2; \ + BO += 1; \ + AO += 1; + + +#define SAVE1x1(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + *(CO1 ) += row0; \ + + +/*******************************************************************************************/ + + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG ldc) +{ + unsigned long M = m, N = n, K = k; + if (M == 0) + return 0; + if (N == 0) + return 0; + if (K == 0) + return 0; + + + + // L8_0 + while (N >= 8 && 0) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 8 * ldc; + + AO = A; + + i = m; + + while (i >= 32 && 0) { + float *BO, *AOb; + // L8_11 + __m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; + BO = B; + int kloop = K; + AOb = AO + 16 * K; + + INIT32x8() + + while (kloop > 0) { + // L12_17 + KERNEL32x8_SUB() + kloop--; + } + // L8_19 + SAVE32x8(alpha) + CO1 += 32; + AO += 16 * K; + + i -= 32; + } + while (i >= 16) { + float *BO; + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT16x8() + + while (kloop > 0) { + KERNEL16x8_SUB() + kloop--; + } + SAVE16x8(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + // L8_11 + __m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT8x8() + + while (kloop > 0) { + // L12_17 + KERNEL8x8_SUB() + kloop--; + } + // L8_19 + SAVE8x8(alpha) + CO1 += 8; + + i -= 8; + } + while (i >= 4) { + // L8_11 + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT4x8() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x8_SUB() + kloop--; + } + // L8_19 + SAVE4x8(alpha) + CO1 += 4; + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; + BO = B; + + INIT2x8() + int kloop = K; + + while (kloop > 0) { + KERNEL2x8_SUB() + kloop--; + } + SAVE2x8(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; + int kloop = K; 
+ BO = B; + INIT1x8() + + while (kloop > 0) { + KERNEL1x8_SUB() + kloop--; + } + SAVE1x8(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 8; + N -= 8; + } + + while (N >= 4) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 4 * ldc; + + AO = A; + + i = m; + while (i >= 64) { + float *BO; + float *A1, *A2, *A3; + // L8_11 + __m512 zmm0, zmm1, zmm2, zmm3, row0, zmm5, row1, zmm7, row2, row3, row0b, row1b, row2b, row3b, row0c, row1c, row2c, row3c, row0d, row1d, row2d, row3d; + BO = B; + int kloop = K; + + A1 = AO + 16 * K; + A2 = A1 + 16 * K; + A3 = A2 + 16 * K; + + INIT64x4() + + while (kloop > 0) { + // L12_17 + KERNEL64x4_SUB() + kloop--; + } + // L8_19 + SAVE64x4(alpha) + CO1 += 64; + AO += 48 * K; + + i -= 64; + } + while (i >= 32) { + float *BO; + float *A1; + // L8_11 + __m512 zmm0, zmm1, zmm2, zmm3, row0, row1, row2, row3, row0b, row1b, row2b, row3b; + BO = B; + int kloop = K; + + A1 = AO + 16 * K; + + INIT32x4() + + while (kloop > 0) { + // L12_17 + KERNEL32x4_SUB() + kloop--; + } + // L8_19 + SAVE32x4(alpha) + CO1 += 32; + AO += 16 * K; + + i -= 32; + } + while (i >= 16) { + float *BO; + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3; + BO = B; + int kloop = K; + + INIT16x4() + + while (kloop > 0) { + // L12_17 + KERNEL16x4_SUB() + kloop--; + } + // L8_19 + SAVE16x4(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + // L8_11 + __m256 ymm0, ymm2, ymm3, ymm4, ymm6,ymm8,ymm10; + BO = B; + int kloop = K; + + INIT8x4() + + while (kloop > 0) { + // L12_17 + KERNEL8x4_SUB() + kloop--; + } + // L8_19 + SAVE8x4(alpha) + CO1 += 8; + + i -= 8; + } + while (i >= 4) { + // L8_11 + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3; + BO = B; + int kloop = K; + + INIT4x4() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x4_SUB() + kloop--; + } + // L8_19 + SAVE4x4(alpha) + CO1 += 4; + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b, row2, row2b, row3, row3b; + BO = B; + + INIT2x4() + int kloop = K; + + while (kloop > 0) { + KERNEL2x4_SUB() + kloop--; + } + SAVE2x4(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1, row2, row3; + int kloop = K; + BO = B; + INIT1x4() + + while (kloop > 0) { + KERNEL1x4_SUB() + kloop--; + } + SAVE1x4(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 4; + N -= 4; + } + +/**************************************************************************************************/ + + // L8_0 + while (N >= 2) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 2 * ldc; + + AO = A; + + i = m; + while (i >= 16) { + float *BO; + + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1; + BO = B; + int kloop = K; + + INIT16x2() + + while (kloop > 0) { + // L12_17 + KERNEL16x2_SUB() + kloop--; + } + // L8_19 + SAVE16x2(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + __m256 ymm0, ymm2, ymm3, ymm4, ymm6; + // L8_11 + BO = B; + int kloop = K; + + INIT8x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x2_SUB() + kloop--; + } + // L8_19 + SAVE8x2(alpha) + CO1 += 8; + + i-=8; + } + + while (i >= 4) { + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1; + // L8_11 + BO = B; + int kloop = K; + + INIT4x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x2_SUB() + kloop--; + } + // L8_19 + SAVE4x2(alpha) 
+ CO1 += 4; + + i-=4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b; + int kloop = K; + BO = B; + + INIT2x2() + + while (kloop > 0) { + KERNEL2x2_SUB() + kloop--; + } + SAVE2x2(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1; + int kloop = K; + BO = B; + + INIT1x2() + + while (kloop > 0) { + KERNEL1x2_SUB() + kloop--; + } + SAVE1x2(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 2; + N -= 2; + } + + // L8_0 + while (N >= 1) { + // L8_10 + float *CO1; + float *AO; + int i; + + CO1 = C; + C += ldc; + + AO = A; + + i = m; + while (i >= 16) { + float *BO; + __m512 zmm0, zmm2, row0; + // L8_11 + BO = B; + int kloop = K; + + INIT16x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL16x1_SUB() + kloop--; + } + // L8_19 + SAVE16x1(alpha) + CO1 += 16; + + i-= 16; + } + while (i >= 8) { + float *BO; + __m256 ymm0, ymm2, ymm4; + // L8_11 + BO = B; + int kloop = K; + + INIT8x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x1_SUB() + kloop--; + } + // L8_19 + SAVE8x1(alpha) + CO1 += 8; + + i-= 8; + } + while (i >= 4) { + float *BO; + __m128 xmm0, xmm2, row0; + // L8_11 + BO = B; + int kloop = K; + + INIT4x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x1_SUB() + kloop--; + } + // L8_19 + SAVE4x1(alpha) + CO1 += 4; + + i-= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, row0, row0b; + int kloop = K; + BO = B; + + INIT2x1() + + while (kloop > 0) { + KERNEL2x1_SUB() + kloop--; + } + SAVE2x1(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, row0; + int kloop = K; + + BO = B; + INIT1x1() + + + while (kloop > 0) { + KERNEL1x1_SUB() + kloop--; + } + SAVE1x1(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 1; + N -= 1; + } + + + return 0; +} diff --git a/kernel/x86_64/sgemm_ncopy_4_skylakex.c b/kernel/x86_64/sgemm_ncopy_4_skylakex.c new file mode 100644 index 000000000..8577e3b38 --- /dev/null +++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#include <immintrin.h>
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){
+ BLASLONG i, j;
+
+ FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+ FLOAT *b_offset;
+ FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
+ FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
+ FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
+ FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+
+ a_offset = a;
+ b_offset = b;
+
+ j = (n >> 2);
+ if (j > 0){
+ do{
+ a_offset1 = a_offset;
+ a_offset2 = a_offset1 + lda;
+ a_offset3 = a_offset2 + lda;
+ a_offset4 = a_offset3 + lda;
+ a_offset += 4 * lda;
+
+ i = (m >> 2);
+ if (i > 0){
+ do{
+ __m128 row0, row1, row2, row3;
+
+ row0 = _mm_loadu_ps(a_offset1);
+ row1 = _mm_loadu_ps(a_offset2);
+ row2 = _mm_loadu_ps(a_offset3);
+ row3 = _mm_loadu_ps(a_offset4);
+
+ _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
+
+ _mm_storeu_ps(b_offset + 0, row0);
+ _mm_storeu_ps(b_offset + 4, row1);
+ _mm_storeu_ps(b_offset + 8, row2);
+ _mm_storeu_ps(b_offset + 12, row3);
+
+ a_offset1 += 4;
+ a_offset2 += 4;
+ a_offset3 += 4;
+ a_offset4 += 4;
+
+ b_offset += 16;
+ i --;
+ }while(i > 0);
+ }
+
+ i = (m & 3);
+ if (i > 0){
+ do{
+ ctemp1 = *(a_offset1 + 0);
+ ctemp5 = *(a_offset2 + 0);
+ ctemp9 = *(a_offset3 + 0);
+ ctemp13 = *(a_offset4 + 0);
+
+ *(b_offset + 0) = ctemp1;
+ *(b_offset + 1) = ctemp5;
+ *(b_offset + 2) = ctemp9;
+ *(b_offset + 3) = ctemp13;
+
+ a_offset1 ++;
+ a_offset2 ++;
+ a_offset3 ++;
+ a_offset4 ++;
+
+ b_offset += 4;
+ i --;
+ }while(i > 0);
+ }
+ j--;
+ }while(j > 0);
+ } /* end of if(j > 0) */
+
+ if (n & 2){
+ a_offset1 = a_offset;
+ a_offset2 = a_offset1 + lda;
+ a_offset += 2 * lda;
+
+ i = (m >> 2);
+ if (i > 0){
+ do{
+ ctemp1 = *(a_offset1 + 0);
+ ctemp2 = *(a_offset1 + 1);
+ ctemp3 = *(a_offset1 + 2);
+ ctemp4 = *(a_offset1 + 3);
+
+ ctemp5 = *(a_offset2 + 0);
+ ctemp6 = *(a_offset2 + 1);
+ ctemp7 = *(a_offset2 + 2);
+ ctemp8 = *(a_offset2 + 3);
+
+ *(b_offset + 0) = ctemp1;
+ *(b_offset + 1) = ctemp5;
+ *(b_offset + 2) = ctemp2;
+ *(b_offset + 3) = ctemp6;
+
+ *(b_offset + 4) = ctemp3;
+ *(b_offset + 5) = ctemp7;
+ *(b_offset + 6) = ctemp4;
+ *(b_offset + 7) = ctemp8;
+
+ a_offset1 += 4;
+ a_offset2 += 4;
+ b_offset += 8;
+ i --;
+ }while(i > 0);
+ }
+
+ i = (m & 3);
+ if (i > 0){
+ do{
+ ctemp1 = *(a_offset1 + 0);
+ ctemp5 = *(a_offset2 + 0);
+
+ *(b_offset + 0) = ctemp1;
+ *(b_offset + 1) = ctemp5;
+
+ a_offset1 ++;
+ a_offset2 ++;
+ b_offset += 2;
+ i --;
+ }while(i > 0);
+ }
+ } /* end of if(n & 2) */
+
+ if (n & 1){
+ a_offset1 = a_offset;
+
+ i = (m >> 2);
+ if (i > 0){
+ do{
+ ctemp1 = 
*(a_offset1 + 0);
+ ctemp2 = *(a_offset1 + 1);
+ ctemp3 = *(a_offset1 + 2);
+ ctemp4 = *(a_offset1 + 3);
+
+ *(b_offset + 0) = ctemp1;
+ *(b_offset + 1) = ctemp2;
+ *(b_offset + 2) = ctemp3;
+ *(b_offset + 3) = ctemp4;
+
+ a_offset1 += 4;
+ b_offset += 4;
+ i --;
+ }while(i > 0);
+ }
+
+ i = (m & 3);
+ if (i > 0){
+ do{
+ ctemp1 = *(a_offset1 + 0);
+ *(b_offset + 0) = ctemp1;
+ a_offset1 ++;
+ b_offset += 1;
+ i --;
+ }while(i > 0);
+ }
+ } /* end of if(n & 1) */
+
+ return 0;
+}
diff --git a/kernel/x86_64/sgemm_tcopy_16_skylakex.c b/kernel/x86_64/sgemm_tcopy_16_skylakex.c
new file mode 100644
index 000000000..dbacc5081
--- /dev/null
+++ b/kernel/x86_64/sgemm_tcopy_16_skylakex.c
@@ -0,0 +1,387 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){
+
+ BLASLONG i, j;
+
+ FLOAT *aoffset;
+ FLOAT *aoffset1, *aoffset2;
+ FLOAT *boffset;
+
+ FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+ FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
+ FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+ FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
+ FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
+ FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
+ FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
+
+ aoffset = a;
+ boffset = b;
+
+#if 0
+ fprintf(stderr, "m = %d n = %d\n", m, n);
+#endif
+
+ j = (n >> 4);
+ if (j > 0){
+ do{
+ aoffset1 = aoffset;
+ aoffset2 = aoffset + lda;
+ aoffset += 16;
+
+ i = (m >> 1);
+ if (i > 0){
+ do{
+ ctemp01 = *(aoffset1 + 0);
+ ctemp02 = *(aoffset1 + 1);
+ ctemp03 = *(aoffset1 + 2);
+ ctemp04 = *(aoffset1 + 3);
+ ctemp05 = *(aoffset1 + 4);
+ ctemp06 = *(aoffset1 + 5);
+ ctemp07 = *(aoffset1 + 6);
+ ctemp08 = *(aoffset1 + 7);
+ ctemp09 = *(aoffset1 + 8);
+ ctemp10 = *(aoffset1 + 9);
+ ctemp11 = *(aoffset1 + 10);
+ ctemp12 = *(aoffset1 + 11);
+ ctemp13 = *(aoffset1 + 12);
+ ctemp14 = *(aoffset1 + 13);
+ ctemp15 = *(aoffset1 + 14);
+ ctemp16 = *(aoffset1 + 15);
+
+ ctemp17 = *(aoffset2 + 0);
+ ctemp18 = *(aoffset2 + 1);
+ ctemp19 = *(aoffset2 + 2);
+ ctemp20 = *(aoffset2 + 3);
+ ctemp21 = *(aoffset2 + 4);
+ ctemp22 = *(aoffset2 + 5);
+ ctemp23 = *(aoffset2 + 6);
+ ctemp24 = *(aoffset2 + 7);
+ ctemp25 = *(aoffset2 + 8);
+ ctemp26 = *(aoffset2 + 9);
+ ctemp27 = *(aoffset2 + 10);
+ ctemp28 = *(aoffset2 + 11);
+ ctemp29 = *(aoffset2 + 12);
+ ctemp30 = *(aoffset2 + 13);
+ ctemp31 = *(aoffset2 + 14);
+ ctemp32 = *(aoffset2 + 15);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp02;
+ *(boffset + 2) = ctemp03;
+ *(boffset + 3) = ctemp04;
+ *(boffset + 4) = ctemp05;
+ *(boffset + 5) = ctemp06;
+ *(boffset + 6) = ctemp07;
+ *(boffset + 7) = ctemp08;
+
+ *(boffset + 8) = ctemp09;
+ *(boffset + 9) = ctemp10;
+ *(boffset + 10) = ctemp11;
+ *(boffset + 11) = ctemp12;
+ *(boffset + 12) = ctemp13;
+ *(boffset + 13) = ctemp14;
+ *(boffset + 14) = ctemp15;
+ *(boffset + 15) = ctemp16;
+
+ *(boffset + 16) = ctemp17;
+ *(boffset + 17) = ctemp18;
+ *(boffset + 18) = ctemp19;
+ *(boffset + 19) = ctemp20;
+ *(boffset + 20) = ctemp21;
+ *(boffset + 21) = ctemp22;
+ *(boffset + 22) = ctemp23;
+ *(boffset + 23) = ctemp24;
+
+ *(boffset + 24) = ctemp25;
+ *(boffset + 25) = ctemp26;
+ *(boffset + 26) = ctemp27;
+ *(boffset + 27) = ctemp28;
+ *(boffset + 28) = ctemp29;
+ *(boffset + 29) = ctemp30;
+ *(boffset + 30) = ctemp31;
+ *(boffset + 31) = ctemp32;
+
+ aoffset1 += 2 * lda;
+ aoffset2 += 2 * lda;
+ boffset += 32;
+
+ i --;
+ }while(i > 0);
+ }
+
+ if (m & 1){
+ ctemp01 = *(aoffset1 + 0);
+ ctemp02 = *(aoffset1 + 1);
+ ctemp03 = *(aoffset1 + 2);
+ ctemp04 = *(aoffset1 + 3);
+ ctemp05 = *(aoffset1 + 4);
+ ctemp06 = *(aoffset1 + 5);
+ ctemp07 = *(aoffset1 + 6);
+ ctemp08 = *(aoffset1 + 7);
+ ctemp09 = *(aoffset1 + 8);
+ ctemp10 = *(aoffset1 + 9);
+ ctemp11 = *(aoffset1 + 10);
+ ctemp12 = *(aoffset1 + 11);
+ ctemp13 = *(aoffset1 + 12);
+ ctemp14 = *(aoffset1 + 13);
+ ctemp15 = *(aoffset1 + 14);
+ ctemp16 = *(aoffset1 + 15);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp02;
+ *(boffset + 2) = ctemp03;
+ *(boffset + 3) = ctemp04;
+ *(boffset + 4) = ctemp05;
+ *(boffset + 5) = ctemp06;
+ *(boffset + 6) = ctemp07;
+ *(boffset + 7) = 
ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 
= aoffset + lda;
+
+ i = (m >> 1);
+ if (i > 0){
+ do{
+ ctemp01 = *(aoffset1 + 0);
+ ctemp02 = *(aoffset2 + 0);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp02;
+
+ aoffset1 += 2 * lda;
+ aoffset2 += 2 * lda;
+ boffset += 2;
+
+ i --;
+ }while(i > 0);
+ }
+
+ if (m & 1){
+ ctemp01 = *(aoffset1 + 0);
+ *(boffset + 0) = ctemp01;
+ // boffset += 1;
+ }
+ }
+
+ return 0;
+}
From 84bcdf9c661fb7484fd9a95c292115234213497a Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 10 Oct 2018 19:15:32 +0200
Subject: [PATCH 236/935] Revert "Add -march=skylake-avx512 when required"

---
 cmake/system_check.cmake | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index 9b8a3d39d..fe30c7600 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -71,8 +71,6 @@ if (X86_64 OR X86)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
-else()
-set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
endif()
file(REMOVE "avx512.tmp" "avx512.o")
endif()
From fa53b903db657b0d5f5bfe5554c7218442c539c9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 10 Oct 2018 19:22:01 +0200
Subject: [PATCH 237/935] Add -march=skylake-avx512 to CFLAGS when the target is Skylake

Should fix #1806 and #1801
---
 cmake/system.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 18b2c3b87..4dc50e64f 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -41,6 +41,11 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
endif ()
endif ()

+if (DEFINED TARGET AND ${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
+ set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
+ set (FCOMMON_OPT "${FCOMMON_OPT} -march=skylake-avx512")
+endif()
+
if (DEFINED TARGET)
message(STATUS "Targeting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
From 8a11ec19d1e4b5b8693f90b1932fb363e56c1200 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 10 Oct 2018 23:47:35 +0200
Subject: [PATCH 238/935] Syntax fix

---
 cmake/system.cmake | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 4dc50e64f..097e1cd5e 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -41,10 +41,12 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
endif ()
endif ()

-if (DEFINED TARGET AND ${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
+if (DEFINED TARGET)
+if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
 set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
 set (FCOMMON_OPT "${FCOMMON_OPT} -march=skylake-avx512")
endif()
+endif()

From 81c9985c3ad1a7a42c1ef5d7277050ecba470def Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 11 Oct 2018 11:03:27 +0200
Subject: [PATCH 239/935] Use KERNEL_DEFINITIONS rather than COMMON_OPTS to pass -march=skylake-avx512

---
 cmake/system.cmake | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 097e1cd5e..61f96edb0 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -43,8 +43,7 @@ endif ()

if (DEFINED TARGET)
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
- set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
- set (FCOMMON_OPT 
"${FCOMMON_OPT} -march=skylake-avx512") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() endif() From 55b244ca0da907b27c4e0306df0a1a90a2238c6a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 12 Oct 2018 09:30:35 +0000 Subject: [PATCH 240/935] enable the SGEMM/SKX C based kernel In QA the final bug was found so now the sklyakex sgemm C based kernel can be activated.... --- kernel/x86_64/KERNEL.SKYLAKEX | 9 +- kernel/x86_64/sgemm_beta_skylakex.c | 6 +- kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 549 --------------------- 3 files changed, 10 insertions(+), 554 deletions(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 48c81e80b..acc6356d6 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -1,6 +1,11 @@ include $(KERNELDIR)/KERNEL.HASWELL -SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S +SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c + +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_skylakex.c +SGEMMONCOPY = sgemm_ncopy_4_skylakex.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c @@ -9,5 +14,5 @@ DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c -SGEMM_BETA = ../generic/gemm_beta.c +SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index b1bf4d77a..54f9664e9 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -60,8 +60,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, if (beta == ZERO){ __m512 z_zero; + __m256 y_zero; z_zero = _mm512_setzero_ps(); + y_zero = _mm256_setzero_ps(); j = n; do { c_offset1 = c_offset; @@ -71,14 +73,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, while (i > 32) { _mm512_storeu_ps(c_offset1, z_zero); - _mm512_storeu_ps(c_offset1 + 8, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); - _mm512_storeu_ps(c_offset1 + 24 , z_zero); c_offset1 += 32; i -= 32; } while (i > 8) { - _mm512_storeu_ps(c_offset1, z_zero); + _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; } diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index b2b1ab03f..10d3d22ed 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -64,419 +64,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#define INIT32x8() \ - row0 = _mm512_setzero_ps(); \ - row1 = _mm512_setzero_ps(); \ - row2 = _mm512_setzero_ps(); \ - row3 = _mm512_setzero_ps(); \ - row4 = _mm512_setzero_ps(); \ - row5 = _mm512_setzero_ps(); \ - row6 = _mm512_setzero_ps(); \ - row0b = _mm512_setzero_ps(); \ - row1b = _mm512_setzero_ps(); \ - row2b = _mm512_setzero_ps(); \ - row3b = _mm512_setzero_ps(); \ - row4b = _mm512_setzero_ps(); \ - row5b = _mm512_setzero_ps(); \ - row6b = _mm512_setzero_ps(); \ - row7b = _mm512_setzero_ps(); \ - -#define KERNEL32x8_SUB() \ - zmm0 = _mm512_loadu_ps(AO); \ - zmm0b = _mm512_loadu_ps(AOb); \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += zmm0 * zmm2; \ - row1 += zmm0 * zmm3; \ - row0b += zmm0b * zmm2; \ - row1b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += zmm0 * zmm2; \ - row3 += zmm0 * zmm3; \ - row2b += zmm0b * zmm2; \ - row3b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += zmm0 * zmm2; \ - row5 += zmm0 * zmm3; \ - row4b += zmm0b * zmm2; \ - row5b += zmm0b * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += zmm0 * zmm2; \ - row7 += zmm0 * zmm3; \ - row6b += zmm0b * zmm2; \ - row7b += zmm0b * zmm3; \ - BO += 8; \ - AO += 16; \ - AOb += 16; - - -#define SAVE32x8(ALPHA) \ - zmm0 = _mm512_set1_ps(ALPHA); \ - row0 *= zmm0; \ - row1 *= zmm0; \ - row2 *= zmm0; \ - row3 *= zmm0; \ - row4 *= zmm0; \ - row5 *= zmm0; \ - row6 *= zmm0; \ - row7 *= zmm0; \ - row0b *= zmm0; \ - row1b *= zmm0; \ - row2b *= zmm0; \ - row3b *= zmm0; \ - row4b *= zmm0; \ - row5b *= zmm0; \ - row6b *= zmm0; \ - row7b *= zmm0; \ - row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ - _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm512_storeu_ps(CO1 + 7 * ldc, row7); \ - row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \ - row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \ - row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \ - row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \ - row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \ - row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \ - row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \ - row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \ - _mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \ - _mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \ - _mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \ - _mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \ - _mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \ - _mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \ - _mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \ - _mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \ - - -#define INIT16x8() \ - row0 = _mm512_setzero_ps(); \ - row1 = _mm512_setzero_ps(); \ - row2 = _mm512_setzero_ps(); \ - row3 = _mm512_setzero_ps(); \ - row4 = _mm512_setzero_ps(); \ - row5 = 
_mm512_setzero_ps(); \ - row6 = _mm512_setzero_ps(); \ - row7 = _mm512_setzero_ps(); \ - -#define KERNEL16x8_SUB() \ - zmm0 = _mm512_loadu_ps(AO); \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += zmm0 * zmm2; \ - row1 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += zmm0 * zmm2; \ - row3 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += zmm0 * zmm2; \ - row5 += zmm0 * zmm3; \ - zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ - zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += zmm0 * zmm2; \ - row7 += zmm0 * zmm3; \ - BO += 8; \ - AO += 16; - - -#define SAVE16x8(ALPHA) \ - zmm0 = _mm512_set1_ps(ALPHA); \ - row0 *= zmm0; \ - row1 *= zmm0; \ - row2 *= zmm0; \ - row3 *= zmm0; \ - row4 *= zmm0; \ - row5 *= zmm0; \ - row6 *= zmm0; \ - row7 *= zmm0; \ - row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ - _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm512_storeu_ps(CO1 + 7 * ldc, row7); - - - -/*******************************************************************************************/ - -#define INIT8x8() \ - row0 = _mm256_setzero_ps(); \ - row1 = _mm256_setzero_ps(); \ - row2 = _mm256_setzero_ps(); \ - row3 = _mm256_setzero_ps(); \ - row4 = _mm256_setzero_ps(); \ - row5 = _mm256_setzero_ps(); \ - row6 = _mm256_setzero_ps(); \ - row7 = _mm256_setzero_ps(); \ - -#define KERNEL8x8_SUB() \ - ymm0 = _mm256_loadu_ps(AO); \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += ymm0 * ymm2; \ - row1 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += ymm0 * ymm2; \ - row3 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += ymm0 * ymm2; \ - row5 += ymm0 * ymm3; \ - ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \ - ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += ymm0 * ymm2; \ - row7 += ymm0 * ymm3; \ - BO += 8; \ - AO += 8; - - -#define SAVE8x8(ALPHA) \ - ymm0 = _mm256_set1_ps(ALPHA); \ - row0 *= ymm0; \ - row1 *= ymm0; \ - row2 *= ymm0; \ - row3 *= ymm0; \ - row4 *= ymm0; \ - row5 *= ymm0; \ - row6 *= ymm0; \ - row7 *= ymm0; \ - row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \ - _mm256_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm256_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm256_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm256_storeu_ps(CO1 + 3 * ldc, row3); \ 
- _mm256_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm256_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm256_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm256_storeu_ps(CO1 + 7 * ldc, row7); \ - - - -/*******************************************************************************************/ - -#define INIT4x8() \ - row0 = _mm_setzero_ps(); \ - row1 = _mm_setzero_ps(); \ - row2 = _mm_setzero_ps(); \ - row3 = _mm_setzero_ps(); \ - row4 = _mm_setzero_ps(); \ - row5 = _mm_setzero_ps(); \ - row6 = _mm_setzero_ps(); \ - row7 = _mm_setzero_ps(); \ - - -#define KERNEL4x8_SUB() \ - xmm0 = _mm_loadu_ps(AO); \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ - row0 += xmm0 * xmm2; \ - row1 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ - row2 += xmm0 * xmm2; \ - row3 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \ - row4 += xmm0 * xmm2; \ - row5 += xmm0 * xmm3; \ - xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \ - xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \ - row6 += xmm0 * xmm2; \ - row7 += xmm0 * xmm3; \ - BO += 8; \ - AO += 4; - - -#define SAVE4x8(ALPHA) \ - xmm0 = _mm_set1_ps(ALPHA); \ - row0 *= xmm0; \ - row1 *= xmm0; \ - row2 *= xmm0; \ - row3 *= xmm0; \ - row4 *= xmm0; \ - row5 *= xmm0; \ - row6 *= xmm0; \ - row7 *= xmm0; \ - row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ - row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ - row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ - row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ - row4 += _mm_loadu_ps(CO1 + 4 * ldc); \ - row5 += _mm_loadu_ps(CO1 + 5 * ldc); \ - row6 += _mm_loadu_ps(CO1 + 6 * ldc); \ - row7 += _mm_loadu_ps(CO1 + 7 * ldc); \ - _mm_storeu_ps(CO1 + 0 * ldc, row0); \ - _mm_storeu_ps(CO1 + 1 * ldc, row1); \ - _mm_storeu_ps(CO1 + 2 * ldc, row2); \ - _mm_storeu_ps(CO1 + 3 * ldc, row3); \ - _mm_storeu_ps(CO1 + 4 * ldc, row4); \ - _mm_storeu_ps(CO1 + 5 * ldc, row5); \ - _mm_storeu_ps(CO1 + 6 * ldc, row6); \ - _mm_storeu_ps(CO1 + 7 * ldc, row7); \ - - -/*******************************************************************************************/ - -#define INIT2x8() \ - row0a = row0b = 0; \ - row1a = row1b = 0; \ - row2a = row2b = 0; \ - row3a = row3b = 0; \ - row4a = row4b = 0; \ - row5a = row5b = 0; \ - row6a = row6b = 0; \ - row7a = row7b = 0; \ - -#define KERNEL2x8_SUB() \ - xmm0 = *(AO); \ - xmm1 = *(AO + 1); \ - xmm2 = *(BO + 0); \ - xmm3 = *(BO + 1); \ - row0a += xmm0 * xmm2; \ - row0b += xmm1 * xmm2; \ - row1a += xmm0 * xmm3; \ - row1b += xmm1 * xmm3; \ - xmm2 = *(BO + 2); \ - xmm3 = *(BO + 3); \ - row2a += xmm0 * xmm2; \ - row2b += xmm1 * xmm2; \ - row3a += xmm0 * xmm3; \ - row3b += xmm1 * xmm3; \ - xmm2 = *(BO + 4); \ - xmm3 = *(BO + 5); \ - row4a += xmm0 * xmm2; \ - row4b += xmm1 * xmm2; \ - row5a += xmm0 * xmm3; \ - row5b += xmm1 * xmm3; \ - xmm2 = *(BO + 6); \ - xmm3 = *(BO + 7); \ - row6a += xmm0 * xmm2; \ - row6b += xmm1 * xmm2; \ - row7a += xmm0 * xmm3; \ - row7b += xmm1 * xmm3; \ - BO += 8; \ - AO += 2; - - -#define SAVE2x8(ALPHA) \ - xmm0 = ALPHA; \ - row0a *= xmm0; \ - row0b *= xmm0; \ - row1a *= xmm0; \ - row1b *= xmm0; \ - row2a *= xmm0; \ - row2b *= xmm0; \ - row3a *= xmm0; \ - row3b *= xmm0; \ - row4a *= xmm0; \ - row4b *= xmm0; \ - row5a *= xmm0; \ - row5b *= xmm0; \ - row6a *= xmm0; \ - row6b *= xmm0; \ - row7a *= xmm0; \ - row7b *= xmm0; \ - *(CO1 + 0 * ldc + 0) += row0a; \ - *(CO1 + 0 * ldc + 1) += row0b; \ - *(CO1 + 1 * ldc + 0) += row1a; \ - *(CO1 + 1 * ldc + 1) += 
row1b; \ - *(CO1 + 2 * ldc + 0) += row2a; \ - *(CO1 + 2 * ldc + 1) += row2b; \ - *(CO1 + 3 * ldc + 0) += row3a; \ - *(CO1 + 3 * ldc + 1) += row3b; \ - *(CO1 + 4 * ldc + 0) += row4a; \ - *(CO1 + 4 * ldc + 1) += row4b; \ - *(CO1 + 5 * ldc + 0) += row5a; \ - *(CO1 + 5 * ldc + 1) += row5b; \ - *(CO1 + 6 * ldc + 0) += row6a; \ - *(CO1 + 6 * ldc + 1) += row6b; \ - *(CO1 + 7 * ldc + 0) += row7a; \ - *(CO1 + 7 * ldc + 1) += row7b; \ - - - -/*******************************************************************************************/ - -#define INIT1x8() \ - row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0; - -#define KERNEL1x8_SUB() \ - xmm0 = *(AO ); \ - xmm2 = *(BO + 0); \ - xmm3 = *(BO + 1); \ - row0 += xmm0 * xmm2; \ - row1 += xmm0 * xmm3; \ - xmm2 = *(BO + 2); \ - xmm3 = *(BO + 3); \ - row2 += xmm0 * xmm2; \ - row3 += xmm0 * xmm3; \ - xmm2 = *(BO + 4); \ - xmm3 = *(BO + 5); \ - row4 += xmm0 * xmm2; \ - row5 += xmm0 * xmm3; \ - xmm2 = *(BO + 6); \ - xmm3 = *(BO + 7); \ - row6 += xmm0 * xmm2; \ - row7 += xmm0 * xmm3; \ - BO += 8; \ - AO += 1; - - -#define SAVE1x8(ALPHA) \ - xmm0 = ALPHA; \ - row0 *= xmm0; \ - row1 *= xmm0; \ - row2 *= xmm0; \ - row3 *= xmm0; \ - row4 *= xmm0; \ - row5 *= xmm0; \ - row6 *= xmm0; \ - row7 *= xmm0; \ - *(CO1 + 0 * ldc) += row0; \ - *(CO1 + 1 * ldc) += row1; \ - *(CO1 + 2 * ldc) += row2; \ - *(CO1 + 3 * ldc) += row3; \ - *(CO1 + 4 * ldc) += row4; \ - *(CO1 + 5 * ldc) += row5; \ - *(CO1 + 6 * ldc) += row6; \ - *(CO1 + 7 * ldc) += row7; \ @@ -1184,142 +771,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; - - // L8_0 - while (N >= 8 && 0) { - float *CO1; - float *AO; - int i; - // L8_10 - CO1 = C; - C += 8 * ldc; - - AO = A; - - i = m; - - while (i >= 32 && 0) { - float *BO, *AOb; - // L8_11 - __m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; - BO = B; - int kloop = K; - AOb = AO + 16 * K; - - INIT32x8() - - while (kloop > 0) { - // L12_17 - KERNEL32x8_SUB() - kloop--; - } - // L8_19 - SAVE32x8(alpha) - CO1 += 32; - AO += 16 * K; - - i -= 32; - } - while (i >= 16) { - float *BO; - // L8_11 - __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT16x8() - - while (kloop > 0) { - KERNEL16x8_SUB() - kloop--; - } - SAVE16x8(alpha) - CO1 += 16; - - i -= 16; - } - while (i >= 8) { - float *BO; - // L8_11 - __m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT8x8() - - while (kloop > 0) { - // L12_17 - KERNEL8x8_SUB() - kloop--; - } - // L8_19 - SAVE8x8(alpha) - CO1 += 8; - - i -= 8; - } - while (i >= 4) { - // L8_11 - float *BO; - __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; - BO = B; - int kloop = K; - - INIT4x8() - // L8_16 - while (kloop > 0) { - // L12_17 - KERNEL4x8_SUB() - kloop--; - } - // L8_19 - SAVE4x8(alpha) - CO1 += 4; - - i -= 4; - } - -/************************************************************************** -* Rest of M -***************************************************************************/ - - while (i >= 2) { - float *BO; - float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; - BO = B; - - INIT2x8() - int kloop = K; - - while (kloop > 0) { - KERNEL2x8_SUB() - kloop--; - } - SAVE2x8(alpha) - CO1 += 2; - i -= 2; - } - // L13_40 - while (i >= 1) { - float *BO; - float xmm0, xmm2, xmm3, row0, 
row1, row2, row3, row4, row5, row6, row7; - int kloop = K; - BO = B; - INIT1x8() - - while (kloop > 0) { - KERNEL1x8_SUB() - kloop--; - } - SAVE1x8(alpha) - CO1 += 1; - i -= 1; - } - - B += K * 8; - N -= 8; - } - while (N >= 4) { float *CO1; float *AO; From c3d93caa8d58e18422014c3ceb4f49ea73cd1f96 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:01:27 -0700 Subject: [PATCH 241/935] ARM64: Remove dependency of XGENE1 Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.XGENE1 | 136 ++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1 index 6ee0c730c..d05754628 100644 --- a/kernel/arm64/KERNEL.XGENE1 +++ b/kernel/arm64/KERNEL.XGENE1 @@ -1 +1,135 @@ -include $(KERNELDIR)/KERNEL.ARMV8 \ No newline at end of file +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c 
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + From 162e31283276a7c108968f3309e2e3371b639bc3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:01:45 -0700 Subject: [PATCH 242/935] ARM64: Remove dependency of CORTEXA57 Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.CORTEXA57 | 47 ++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 371e488cd..2fd2c3d87 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -1,4 +1,49 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S From 8001fdcd2a6796c0747e5df25c38a082c0261b0f Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:02:16 -0700 Subject: [PATCH 243/935] ARM64: Remove dependency of THUNDERX Makefile on ARMV8 Makefile --- kernel/arm64/KERNEL.THUNDERX | 135 +++++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index 11b7a2ca8..e19655e8c 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -1,6 +1,133 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = 
iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx.c +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot_thunderx.c +DDOTKERNEL = ddot_thunderx.c +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SDOTKERNEL=dot_thunderx.c -DDOTKERNEL=ddot_thunderx.c -DAXPYKERNEL=daxpy_thunderx.c From caf339412f9e828ffd3e43ec4b58ecd992eeff7a Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:02:40 -0700 Subject: [PATCH 244/935] ARM64: Remove dependency of THUNDERX2T99 Makefile on CORTEXA57 Makefile --- kernel/arm64/KERNEL.THUNDERX2T99 | 137 ++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index b66cd0e8b..a73d4cee8 100644 --- 
a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -1,4 +1,137 @@ -include $(KERNELDIR)/KERNEL.CORTEXA57 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = 
../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
 SASUMKERNEL = sasum_thunderx2t99.c
 DASUMKERNEL = dasum_thunderx2t99.c
@@ -27,12 +160,12 @@ CNRM2KERNEL = scnrm2_thunderx2t99.c
 DNRM2KERNEL = dznrm2_thunderx2t99.c
 ZNRM2KERNEL = dznrm2_thunderx2t99.c
-DAXPYKERNEL = daxpy_thunderx2t99.S
 DDOTKERNEL = dot_thunderx2t99.c
 SDOTKERNEL = dot_thunderx2t99.c
 CDOTKERNEL = zdot_thunderx2t99.c
 ZDOTKERNEL = zdot_thunderx2t99.c
+DSDOTKERNEL = dot.S
 ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
 DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S

From 21f46a1cf2cefbdedf89878e3a6324578d0fe8ca Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Wed, 17 Oct 2018 08:11:27 -0700
Subject: [PATCH 245/935] ARM64: Use THUNDERX2T99 Neon Kernels for ARMV8

Currently the generic ARMV8 target uses C implementations for many routines. Replace these with the NEON implementations written for the THUNDERX2T99 target, which are up to 6x faster for certain routines.
---
driver/others/parameter.c | 4 +-
interface/swap.c | 2 +-
kernel/arm64/KERNEL.ARMV8 | 220 ++++++++++++++++++++++++++------------
param.h | 47 ++++++--
4 files changed, 196 insertions(+), 77 deletions(-)

diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index e7332c0c4..0f2364d9f 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -730,7 +730,7 @@ void blas_set_parameter(void){
 #if defined(ARCH_ARM64)
-#if defined(VULCAN) || defined(THUNDERX2T99)
+#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8)
 unsigned long dgemm_prefetch_size_a;
 unsigned long dgemm_prefetch_size_b;
 unsigned long dgemm_prefetch_size_c;
@@ -738,7 +738,7 @@ unsigned long dgemm_prefetch_size_c;
 void blas_set_parameter(void)
 {
-#if defined(VULCAN) || defined(THUNDERX2T99)
+#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8)
 dgemm_p = 160;
 dgemm_q = 128;
 dgemm_r = 4096;

diff --git a/interface/swap.c b/interface/swap.c
index f7642edf1..17a9868a9 100644
--- a/interface/swap.c
+++ b/interface/swap.c
@@ -42,7 +42,7 @@
 #include "functable.h"
 #endif
-#if defined(THUNDERX2T99) || defined(VULCAN)
+#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
 // Multithreaded swap gives performance benefits in ThunderX2T99
 #else
 // Disable multi-threading as it does not show any performance

diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index 4c6d6fb71..7e7a900fb 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -1,8 +1,3 @@
-SAMAXKERNEL = amax.S
-DAMAXKERNEL = amax.S
-CAMAXKERNEL = zamax.S
-ZAMAXKERNEL = zamax.S
-
 SAMINKERNEL = ../arm/amin.c
 DAMINKERNEL = ../arm/amin.c
 CAMINKERNEL = ../arm/zamin.c
@@ -14,11 +9,6 @@ DMAXKERNEL = ../arm/max.c
 SMINKERNEL = ../arm/min.c
 DMINKERNEL = ../arm/min.c
-ISAMAXKERNEL = iamax.S
-IDAMAXKERNEL = iamax.S
-ICAMAXKERNEL = izamax.S
-IZAMAXKERNEL = izamax.S
-
 ISAMINKERNEL = ../arm/iamin.c
 IDAMINKERNEL = ../arm/iamin.c
 ICAMINKERNEL = ../arm/izamin.c
@@ -30,33 +20,35 @@ IDMAXKERNEL = ../arm/imax.c
 ISMINKERNEL = ../arm/imin.c
 IDMINKERNEL =
../arm/imin.c -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SDOTKERNEL = dot.S -DDOTKERNEL = dot.S -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ifneq ($(OS_DARWIN)$(CROSS),11) -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S -endif +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S SROTKERNEL = rot.S DROTKERNEL = rot.S @@ -68,11 +60,6 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - SGEMVNKERNEL = gemv_n.S DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S @@ -83,18 +70,137 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S -STRMMKERNEL = ../generic/trmmkernel_4x4.c + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +ifneq ($(OS_DARWIN)$(CROSS),11) +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c +endif + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifneq ($(OS_DARWIN)$(CROSS),11) + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = 
dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif + +else + +STRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ifneq ($(OS_DARWIN)$(CROSS),11) -SGEMMKERNEL = sgemm_kernel_4x4.S -else SGEMMKERNEL = ../generic/gemmkernel_2x2.c -endif -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o @@ -116,26 +222,4 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = 
../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - +endif diff --git a/param.h b/param.h index ded9fe0b8..c7952e113 100644 --- a/param.h +++ b/param.h @@ -2583,6 +2583,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(ARMV8) + +#if defined(OS_DARWIN) && defined(CROSS) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2590,13 +2592,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#if defined(OS_DARWIN) && defined(CROSS) #define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL N 2 -#else -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 -#endif +#define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 @@ -2622,10 +2619,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#define SYMV_P 16 +#else + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p + +#define SGEMM_DEFAULT_Q sgemm_q +#define DGEMM_DEFAULT_Q dgemm_q +#define CGEMM_DEFAULT_Q cgemm_q +#define ZGEMM_DEFAULT_Q zgemm_q + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r #define SYMV_P 16 #endif +#endif + #if defined(THUNDERX) #define SNUMOPT 2 #define DNUMOPT 2 From c7bbf9c987a0473aafbd8a4f48ed07cd52fccc38 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 20 Oct 2018 11:13:29 +0300 Subject: [PATCH 247/935] Attempt to tame _hemv threading #1820 --- interface/zhemv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/interface/zhemv.c b/interface/zhemv.c index d1996ad69..8995ca1c2 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -195,7 +195,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + // see graph in issue #1820 for explanation and room for improvement + if (n<362) { + nthreads = 1 ; + } else { + nthreads = num_cpu_avail(2); + }; if (nthreads == 1) { #endif From a293bdcd5eaa610ed960264c4e1c48af662502e9 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 20 Oct 2018 21:37:53 +0300 Subject: [PATCH 248/935] re-arrange new code for readability --- interface/zhemv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/interface/zhemv.c b/interface/zhemv.c index 8995ca1c2..9c31f31d9 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -43,6 +43,10 @@ #include "functable.h" #endif +// this is smallest dimension N of square input a to permit threading +// see graph in issue #1820 for explanation +#define MULTI_THREAD_MINIMAL 362 + #ifdef XDOUBLE #define ERROR_NAME "XHEMV " #elif defined(DOUBLE) @@ -195,8 +199,7 @@ void CNAME(enum CBLAS_ORDER order, 
enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
 buffer = (FLOAT *)blas_memory_alloc(1);
 #ifdef SMP
-  // see graph in issue #1820 for explanation and room for improvement
-  if (n<362) {
+  if (n<MULTI_THREAD_MINIMAL) {
    nthreads = 1 ;
   } else {
    nthreads = num_cpu_avail(2);
   };

From: Ashwin Sekhar T K
Date: Thu, 18 Oct 2018 04:51:24 -0700
Subject: [PATCH 249/935] ARM64: Remove XGENE1 references

Remove the XGENE1 target, as its implementation is incomplete. Moreover, anyone who wishes to build for XGENE1 can use the generic ARMV8 target, as there are no XGENE1-specific optimizations in OpenBLAS.
---
kernel/arm64/KERNEL.XGENE1 | 135 ------------------------------------
1 file changed, 135 deletions(-)
delete mode 100644 kernel/arm64/KERNEL.XGENE1

diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1
deleted file mode 100644
index d05754628..000000000
--- a/kernel/arm64/KERNEL.XGENE1
+++ /dev/null
@@ -1,135 +0,0 @@
-SAMAXKERNEL = amax.S
-DAMAXKERNEL = amax.S
-CAMAXKERNEL = zamax.S
-ZAMAXKERNEL = zamax.S
-
-SAMINKERNEL = ../arm/amin.c
-DAMINKERNEL = ../arm/amin.c
-CAMINKERNEL = ../arm/zamin.c
-ZAMINKERNEL = ../arm/zamin.c
-
-SMAXKERNEL = ../arm/max.c
-DMAXKERNEL = ../arm/max.c
-
-SMINKERNEL = ../arm/min.c
-DMINKERNEL = ../arm/min.c
-
-ISAMAXKERNEL = iamax.S
-IDAMAXKERNEL = iamax.S
-ICAMAXKERNEL = izamax.S
-IZAMAXKERNEL = izamax.S
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL = ../arm/imax.c
-IDMAXKERNEL = ../arm/imax.c
-
-ISMINKERNEL = ../arm/imin.c
-IDMINKERNEL = ../arm/imin.c
-
-SASUMKERNEL = asum.S
-DASUMKERNEL = asum.S
-CASUMKERNEL = casum.S
-ZASUMKERNEL = zasum.S
-
-SAXPYKERNEL = axpy.S
-DAXPYKERNEL = axpy.S
-CAXPYKERNEL = zaxpy.S
-ZAXPYKERNEL = zaxpy.S
-
-SCOPYKERNEL = copy.S
-DCOPYKERNEL = copy.S
-CCOPYKERNEL = copy.S
-ZCOPYKERNEL = copy.S
-
-SDOTKERNEL = dot.S
-DDOTKERNEL = dot.S
-CDOTKERNEL = zdot.S
-ZDOTKERNEL = zdot.S
-DSDOTKERNEL = dot.S
-
-SNRM2KERNEL = nrm2.S
-DNRM2KERNEL = nrm2.S
-CNRM2KERNEL = znrm2.S
-ZNRM2KERNEL = znrm2.S
-
-SROTKERNEL = rot.S
-DROTKERNEL = rot.S
-CROTKERNEL = zrot.S
-ZROTKERNEL = zrot.S
-
-SSCALKERNEL = scal.S
-DSCALKERNEL = scal.S
-CSCALKERNEL = zscal.S
-ZSCALKERNEL = zscal.S
-
-SSWAPKERNEL = swap.S
-DSWAPKERNEL = swap.S
-CSWAPKERNEL = swap.S
-ZSWAPKERNEL = swap.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-STRMMKERNEL = ../generic/trmmkernel_4x4.c
-DTRMMKERNEL = ../generic/trmmkernel_2x2.c
-CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
-ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
-
-SGEMMKERNEL = sgemm_kernel_4x4.S
-SGEMMONCOPY = ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMONCOPYOBJ = sgemm_oncopy.o
-SGEMMOTCOPYOBJ = sgemm_otcopy.o
-
-DGEMMKERNEL = ../generic/gemmkernel_2x2.c
-DGEMMONCOPY = ../generic/gemm_ncopy_2.c
-DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
-
-CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
-CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
-CGEMMONCOPYOBJ = cgemm_oncopy.o
-CGEMMOTCOPYOBJ = cgemm_otcopy.o
-
-ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
-ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
-ZGEMMONCOPYOBJ = zgemm_oncopy.o
-ZGEMMOTCOPYOBJ = zgemm_otcopy.o
-
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT =
../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - From d50abc8903089089357766d3ada7db090ff6e63d Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 18 Oct 2018 05:02:23 -0700 Subject: [PATCH 250/935] ARM64: Move parameters from parameter.c to param.h Remove the runtime setting of P, Q, R parameters for targets ARMV8, THUNDERX2T99. Instead set them as constants in param.h at compile time. --- driver/others/parameter.c | 27 ----------- kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S | 10 ++-- param.h | 48 ++++++++++---------- 3 files changed, 27 insertions(+), 58 deletions(-) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 0f2364d9f..8bf7da78b 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -730,35 +730,8 @@ void blas_set_parameter(void){ #if defined(ARCH_ARM64) -#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) -unsigned long dgemm_prefetch_size_a; -unsigned long dgemm_prefetch_size_b; -unsigned long dgemm_prefetch_size_c; -#endif - void blas_set_parameter(void) { -#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) - dgemm_p = 160; - dgemm_q = 128; - dgemm_r = 4096; - - sgemm_p = 128; - sgemm_q = 352; - sgemm_r = 4096; - - cgemm_p = 128; - cgemm_q = 224; - cgemm_r = 4096; - - zgemm_p = 128; - zgemm_q = 112; - zgemm_r = 4096; - - dgemm_prefetch_size_a = 3584; - dgemm_prefetch_size_b = 512; - dgemm_prefetch_size_c = 128; -#endif } #endif diff --git a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S index 598db6e0c..d1551ffea 100644 --- a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S @@ -943,13 +943,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] - - ldr A_PRE_SIZE, =dgemm_prefetch_size_a - ldr A_PRE_SIZE, [A_PRE_SIZE] - ldr B_PRE_SIZE, =dgemm_prefetch_size_b - ldr B_PRE_SIZE, [B_PRE_SIZE] - ldr C_PRE_SIZE, =dgemm_prefetch_size_c - ldr C_PRE_SIZE, [C_PRE_SIZE] + mov A_PRE_SIZE, #3584 + mov B_PRE_SIZE, #512 + mov C_PRE_SIZE, #128 add A_PRE_SIZE_64, A_PRE_SIZE, #64 add B_PRE_SIZE_64, B_PRE_SIZE, #64 diff --git a/param.h b/param.h index c7952e113..e4ec1b2b5 100644 --- a/param.h +++ b/param.h @@ -2641,20 +2641,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 4
 #define ZGEMM_DEFAULT_UNROLL_N 4
-#define SGEMM_DEFAULT_P sgemm_p
-#define DGEMM_DEFAULT_P dgemm_p
-#define CGEMM_DEFAULT_P cgemm_p
-#define ZGEMM_DEFAULT_P zgemm_p
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
-#define SGEMM_DEFAULT_Q sgemm_q
-#define DGEMM_DEFAULT_Q dgemm_q
-#define CGEMM_DEFAULT_Q cgemm_q
-#define ZGEMM_DEFAULT_Q zgemm_q
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
-#define SGEMM_DEFAULT_R sgemm_r
-#define DGEMM_DEFAULT_R dgemm_r
-#define CGEMM_DEFAULT_R cgemm_r
-#define ZGEMM_DEFAULT_R zgemm_r
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
 #define SYMV_P 16
 #endif
@@ -2720,20 +2720,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 4
 #define ZGEMM_DEFAULT_UNROLL_N 4
-#define SGEMM_DEFAULT_P sgemm_p
-#define DGEMM_DEFAULT_P dgemm_p
-#define CGEMM_DEFAULT_P cgemm_p
-#define ZGEMM_DEFAULT_P zgemm_p
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
-#define SGEMM_DEFAULT_Q sgemm_q
-#define DGEMM_DEFAULT_Q dgemm_q
-#define CGEMM_DEFAULT_Q cgemm_q
-#define ZGEMM_DEFAULT_Q zgemm_q
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
-#define SGEMM_DEFAULT_R sgemm_r
-#define DGEMM_DEFAULT_R dgemm_r
-#define CGEMM_DEFAULT_R cgemm_r
-#define ZGEMM_DEFAULT_R zgemm_r
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
 #define SYMV_P 16
 #endif

From e7b66cd36e12845701aaae979c29120439294368 Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Thu, 18 Oct 2018 05:13:02 -0700
Subject: [PATCH 251/935] ARM64: Fix DYNAMIC_ARCH compilation for cores which don't use GEMM3M

---
kernel/Makefile | 4 ++
kernel/setparam-ref.c | 85 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 89 insertions(+)

diff --git a/kernel/Makefile b/kernel/Makefile
index a0a8fcd21..923ffc363 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -88,7 +88,11 @@ lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL)
 $(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F)
 setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
+ifeq ($(USE_GEMM3M), 1)
+ $(CC) -c $(CFLAGS) -DUSE_GEMM3M $< -o $@
+else
 $(CC) -c $(CFLAGS) $< -o $@
+endif
 setparam$(TSUFFIX).c : setparam-ref.c
 sed 's/TS/$(TSUFFIX)/g' $< > $(@F)

diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index f654de110..e035d5bda 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -294,6 +294,8 @@ gotoblas_t TABLE_NAME = {
 chemm_outcopyTS, chemm_oltcopyTS,
 0, 0, 0,
+
+#if defined(USE_GEMM3M)
 #ifdef CGEMM3M_DEFAULT_UNROLL_M
 CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N),
 #else
@@ -324,6 +326,33 @@ gotoblas_t TABLE_NAME = {
 chemm3m_oucopybTS, chemm3m_olcopybTS,
 chemm3m_oucopyrTS, chemm3m_olcopyrTS,
 chemm3m_oucopyiTS, chemm3m_olcopyiTS,
+#else
+ 0, 0, 0,
+
+ NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+#endif
 #ifndef NO_LAPACK
 cneg_tcopyTS, claswp_ncopyTS,
@@ -400,6 +429,7
@@ gotoblas_t TABLE_NAME = {
 zhemm_outcopyTS, zhemm_oltcopyTS,
 0, 0, 0,
+#if defined(USE_GEMM3M)
 #ifdef ZGEMM3M_DEFAULT_UNROLL_M
 ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N),
 #else
@@ -430,6 +460,33 @@ gotoblas_t TABLE_NAME = {
 zhemm3m_oucopybTS, zhemm3m_olcopybTS,
 zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
 zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
+#else
+ 0, 0, 0,
+
+ NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+#endif
 #ifndef NO_LAPACK
 zneg_tcopyTS, zlaswp_ncopyTS,
@@ -503,6 +560,7 @@ gotoblas_t TABLE_NAME = {
 xhemm_outcopyTS, xhemm_oltcopyTS,
 0, 0, 0,
+#if defined(USE_GEMM3M)
 QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
 xgemm3m_kernelTS,
@@ -528,6 +586,33 @@ gotoblas_t TABLE_NAME = {
 xhemm3m_oucopybTS, xhemm3m_olcopybTS,
 xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
 xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
+#else
+ 0, 0, 0,
+
+ NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+
+ NULL, NULL,
+ NULL, NULL,
+ NULL, NULL,
+#endif
 #ifndef NO_LAPACK
 xneg_tcopyTS, xlaswp_ncopyTS,

From af2837c392344c54e03e517902ae4fa4983570c0 Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Mon, 22 Oct 2018 01:49:16 -0700
Subject: [PATCH 252/935] ARM64: Remove #define ARMV8 for THUNDERX

---
cpuid_arm64.c | 1 -
1 file changed, 1 deletion(-)

diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index a42346c88..17078fe7f 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -237,7 +237,6 @@ void get_cpuconfig(void)
 break;
 case CPU_THUNDERX:
- printf("#define ARMV8\n");
 printf("#define THUNDERX\n");
 printf("#define L1_DATA_SIZE 32768\n");
 printf("#define L1_DATA_LINESIZE 128\n");

From d5aeff636f2d8ba99d1e5ed511c3770970f440af Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Thu, 18 Oct 2018 05:15:45 -0700
Subject: [PATCH 253/935] ARM64: Enable DYNAMIC_ARCH

Enable the DYNAMIC_ARCH feature on ARM64. This patch uses the cpuid feature in the Linux kernel to detect the core type at runtime (https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt). If this feature is missing in the kernel, the user should use the OPENBLAS_CORETYPE env variable to select the desired core type.
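[Editor's note: a condensed sketch of the detection scheme this message describes, assuming a Linux/AArch64 system where the kernel traps and emulates userspace MIDR_EL1 reads (the HWCAP_CPUID feature); decode_core() is a hypothetical helper for illustration, not the patch's actual API:

#include <sys/auxv.h>   /* getauxval, AT_HWCAP */
#include <asm/hwcap.h>  /* HWCAP_CPUID */

static const char *decode_core(void)
{
    unsigned long midr;

    /* Without HWCAP_CPUID the mrs read below would trap fatally,
     * so bail out; the patch then falls back to OPENBLAS_CORETYPE. */
    if (!(getauxval(AT_HWCAP) & HWCAP_CPUID))
        return "unknown";

    asm("mrs %0, MIDR_EL1" : "=r" (midr));
    unsigned int implementer = (midr >> 24) & 0xFF;  /* bits 31:24 */
    unsigned int part        = (midr >>  4) & 0xFFF; /* bits 15:4  */

    if (implementer == 0x43 && part == 0x0af)        /* Cavium ThunderX2 */
        return "thunderx2t99";
    return "armv8";                                  /* generic fallback */
}

The same implementer/part pairs appear in the get_coretype() switch of the patch that follows.]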
--- Makefile.system | 7 ++ driver/others/Makefile | 8 ++ driver/others/dynamic_arm64.c | 198 +++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.ARMV8 | 48 ++++---- kernel/arm64/KERNEL.CORTEXA57 | 32 ++--- kernel/arm64/KERNEL.THUNDERX | 16 +-- kernel/arm64/KERNEL.THUNDERX2T99 | 32 ++--- kernel/setparam-ref.c | 73 ++++++++++++ 8 files changed, 350 insertions(+), 64 deletions(-) create mode 100644 driver/others/dynamic_arm64.c diff --git a/Makefile.system b/Makefile.system index b4cd4222a..7847c7525 100644 --- a/Makefile.system +++ b/Makefile.system @@ -510,6 +510,13 @@ CCOMMON_OPT += $(XCCOMMON_OPT) #CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' endif +ifeq ($(ARCH), arm64) +DYNAMIC_CORE = ARMV8 +DYNAMIC_CORE += CORTEXA57 +DYNAMIC_CORE += THUNDERX +DYNAMIC_CORE += THUNDERX2T99 +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= diff --git a/driver/others/Makefile b/driver/others/Makefile index e61ba7bc8..3dc2e7c1b 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -15,7 +15,11 @@ endif # COMMONOBJS += info.$(SUFFIX) ifeq ($(DYNAMIC_ARCH), 1) +ifeq ($(ARCH),arm64) +COMMONOBJS += dynamic_arm64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c endif ifeq ($(DYNAMIC_ARCH), 1) +ifeq ($(ARCH),arm64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c new file mode 100644 index 000000000..b4ce6b67d --- /dev/null +++ b/driver/others/dynamic_arm64.c @@ -0,0 +1,198 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include "common.h"
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+
+extern gotoblas_t gotoblas_ARMV8;
+extern gotoblas_t gotoblas_CORTEXA57;
+extern gotoblas_t gotoblas_THUNDERX;
+extern gotoblas_t gotoblas_THUNDERX2T99;
+
+extern void openblas_warning(int verbose, const char * msg);
+
+#define NUM_CORETYPES 4
+
+/*
+ * In case asm/hwcap.h is outdated on the build system, make sure
+ * that HWCAP_CPUID is defined
+ */
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif
+
+#define get_cpu_ftr(id, var) ({ \
+ asm("mrs %0, "#id : "=r" (var)); \
+ })
+
+static char *corename[] = {
+ "armv8",
+ "cortexa57",
+ "thunderx",
+ "thunderx2t99",
+ "unknown"
+};
+
+char *gotoblas_corename(void) {
+ if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
+ if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
+ if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
+ if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
+ return corename[NUM_CORETYPES];
+}
+
+static gotoblas_t *force_coretype(char *coretype) {
+ int i ;
+ int found = -1;
+ char message[128];
+
+ for ( i=0 ; i < NUM_CORETYPES; i++)
+ {
+ if (!strncasecmp(coretype, corename[i], 20))
+ {
+ found = i;
+ break;
+ }
+ }
+
+ switch (found)
+ {
+ case 0: return (&gotoblas_ARMV8);
+ case 1: return (&gotoblas_CORTEXA57);
+ case 2: return (&gotoblas_THUNDERX);
+ case 3: return (&gotoblas_THUNDERX2T99);
+ }
+ snprintf(message, 128, "Core not found: %s\n", coretype);
+ openblas_warning(1, message);
+ return NULL;
+}
+
+static gotoblas_t *get_coretype(void) {
+ int implementer, variant, part, arch, revision, midr_el1;
+
+ if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
+ char coremsg[128];
+ snprintf(coremsg, 128, "Kernel lacks cpuid feature support. 
Auto detection of core type failed !!!\n"); + openblas_warning(1, coremsg); + return NULL; + } + + get_cpu_ftr(MIDR_EL1, midr_el1); + /* + * MIDR_EL1 + * + * 31 24 23 20 19 16 15 4 3 0 + * ----------------------------------------------------------------- + * | Implementer | Variant | Architecture | Part Number | Revision | + * ----------------------------------------------------------------- + */ + implementer = (midr_el1 >> 24) & 0xFF; + part = (midr_el1 >> 4) & 0xFFF; + + switch(implementer) + { + case 0x41: // ARM + switch (part) + { + case 0xd07: // Cortex A57 + case 0xd08: // Cortex A72 + case 0xd03: // Cortex A53 + return &gotoblas_CORTEXA57; + } + break; + case 0x42: // Broadcom + switch (part) + { + case 0x516: // Vulcan + return &gotoblas_THUNDERX2T99; + } + break; + case 0x43: // Cavium + switch (part) + { + case 0x0a1: // ThunderX + return &gotoblas_THUNDERX; + case 0x0af: // ThunderX2 + return &gotoblas_THUNDERX2T99; + } + break; + } + return NULL; +} + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_ARMV8; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 7e7a900fb..bcecd0026 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -113,13 +113,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -134,8 +134,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -146,34 +146,34 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq 
($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S @@ -201,25 +201,25 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 2fd2c3d87..04d6940d7 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -111,13 +111,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = 
sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -132,8 +132,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -144,32 +144,32 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index e19655e8c..cb02c7bc5 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -89,26 +89,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = sgemm_kernel_4x4.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = 
zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index a73d4cee8..a20d0d4a6 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -74,13 +74,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -94,8 +94,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -106,32 +106,32 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index e035d5bda..6d4028b0b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -646,6 +646,78 @@ gotoblas_t TABLE_NAME = { }; +#if defined(ARCH_ARM64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = 
SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif + +} +#else // defined(ARCH_ARM64) #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1231,3 +1303,4 @@ static void init_parameter(void) { } +#endif //defined(ARCH_ARM64) From 2992e3886aa6304ac2715890f4fbd8548e891c53 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Mon, 22 Oct 2018 23:21:49 +0300 Subject: [PATCH 254/935] disable threading in C/ZSWAP copying from S/DSWAP --- interface/zswap.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/interface/zswap.c b/interface/zswap.c index e33bbafba..372b15447 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,6 +42,14 @@ #include "functable.h" #endif +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +// Multithreaded swap gives performance benefits in ThunderX2T99 +#else +// Disable multi-threading as it does not show any performance +// benefits. Keep the multi-threading code for the record. +#undef SMP +#endif + #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ @@ -81,7 +89,7 @@ FLOAT *y = (FLOAT*)vy; #ifdef SMP //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) + if (incx == 0 || incy == 0 || n < 1048576 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) nthreads = 1; else nthreads = num_cpu_avail(1); From 2d8064174c444bb377cc2e3879a9c8e76e45b314 Mon Sep 17 00:00:00 2001 From: fengrl <42458138+fengrl@users.noreply.github.com> Date: Fri, 26 Oct 2018 17:55:15 +0800 Subject: [PATCH 255/935] register push/pop command change 64bit push/pop register command should be used. Otherwise, data will lost. 
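The failure mode is easy to sketch in plain C (illustrative only, not part of the patch): saving a 64-bit floating-point register through a 32-bit slot keeps only half of its bits, so the value read back on restore differs from what was saved — which is what the sdc1/ldc1 change below avoids.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void) {
    double d = 3.141592653589793;  /* stands in for a callee-saved $f24..$f28 */
    uint32_t slot;                 /* a 32-bit save slot, as with a 32-bit store */
    double restored = 0.0;

    memcpy(&slot, &d, sizeof(slot));        /* 32-bit "push": half the bits dropped */
    memcpy(&restored, &slot, sizeof(slot)); /* 32-bit "pop": other half never restored */

    printf("original: %.17g\nrestored: %.17g\n", d, restored);
    return 0;
}

On a 64-bit MIPS build the sdc1/ldc1 pair stores and reloads the full doubleword of each FP register, which the 32-bit ST/LD forms used in this single-precision kernel did not.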
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 36 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 37b20a880..82703ff5d 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -146,11 +146,11 @@ sd $21, 40($sp) sd $22, 48($sp) - ST $f24, 56($sp) - ST $f25, 64($sp) - ST $f26, 72($sp) - ST $f27, 80($sp) - ST $f28, 88($sp) + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) #if defined(TRMMKERNEL) sd $23, 96($sp) @@ -161,10 +161,10 @@ #endif #ifndef __64BIT__ - ST $f20,120($sp) - ST $f21,128($sp) - ST $f22,136($sp) - ST $f23,144($sp) + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) #endif .align 4 @@ -7766,11 +7766,11 @@ ld $21, 40($sp) ld $22, 48($sp) - LD $f24, 56($sp) - LD $f25, 64($sp) - LD $f26, 72($sp) - LD $f27, 80($sp) - LD $f28, 88($sp) + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 $f26, 72($sp) + ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) #if defined(TRMMKERNEL) ld $23, 96($sp) @@ -7779,10 +7779,10 @@ #endif #ifndef __64BIT__ - LD $f20,120($sp) - LD $f21,128($sp) - LD $f22,136($sp) - LD $f23,144($sp) + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) #endif daddiu $sp,$sp,STACKSIZE From 64ca44873bd9d960c63456a43fd565c56514e895 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Oct 2018 18:36:55 +0100 Subject: [PATCH 256/935] Fix detection of Ryzen2 (missing CORE_ZEN) --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 512ad877b..8e4a7cb84 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2009,6 +2009,8 @@ int get_coretype(void){ switch (model) { case 1: // AMD Ryzen + case 8: + // Ryzen 2 if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; From 38cf5d93647bf5ffb5fe3e17447eba0c157bb305 Mon Sep 17 00:00:00 2001 From: "Erik M. 
Bray" Date: Sun, 28 Oct 2018 21:16:52 +0000 Subject: [PATCH 257/935] ensure that threading has been initialized in the first place before calling openblas_set_num_threads --- driver/others/blas_server.c | 5 +++++ driver/others/blas_server_win32.c | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 6a25e2d07..e5db1804f 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) { long i; +#ifdef SMP_SERVER + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif + if (num_threads < 1) num_threads = blas_num_threads; #ifndef NO_AFFINITY diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 02a25ac39..bae344c59 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){ void goto_set_num_threads(int num_threads) { - long i; + long i; + +#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif if (num_threads < 1) num_threads = blas_cpu_number; From 326d394a0fbcc8226bb958f523ca1005696c33b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Oct 2018 18:38:22 +0100 Subject: [PATCH 258/935] Add get_num_procs implementation for AIX (and copy HAIKU implementation to the non-TLS version of the code as well) --- driver/others/memory.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 4a8e6c067..25f198623 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -259,6 +259,16 @@ int get_num_procs(void) { } #endif +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + + + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -1738,6 +1748,22 @@ int get_num_procs(void) { return nums; } #endif + +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif #ifdef OS_WINDOWS From 7b5aea52bb105c15d7e80e0749b80f6bfb0566b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Oct 2018 21:50:34 +0100 Subject: [PATCH 259/935] Accomodate AIX install, which has different syntax for #1803 --- Makefile.install | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Makefile.install b/Makefile.install index fa657beba..7aa477cf0 100644 --- a/Makefile.install +++ b/Makefile.install @@ -48,6 +48,7 @@ ifndef NO_CBLAS @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif +ifneq (($OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif + ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @cp $(LIBSONAME) 
"$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT) endif endif +else +#install on AIX has different options syntax +ifndef NO_LAPACKE + @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" +endif + +#for install static library +ifndef NO_STATIC + @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @install -M 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +endif +#for install shared library +ifndef NO_SHARED + @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @install -M 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) +endif + +endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" From dcc5d6291e7b02761acfb6161c04ba1f8f25b502 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:42:09 +0000 Subject: [PATCH 260/935] skylakex: Make the sgemm/dgemm beta code robust for a N=0 or M=0 case in the threading code there are cases where N or M can become 0, and the optimized beta code did not handle this well, leading to a crash during the audit for the crash a few edge conditions on the if statements were found and fixed as well --- kernel/x86_64/dgemm_beta_skylakex.c | 6 ++++-- kernel/x86_64/sgemm_beta_skylakex.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 384e9f60b..6a824c9b5 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (m == 0 || n == 0) + return 0; c_offset = c; @@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm512_storeu_pd(c_offset1, z_zero); c_offset1 += 8; i -= 8; diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 54f9664e9..4e40acadf 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (n == 0 || m == 0) + return; c_offset = c; @@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) 
{ _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; From 5b708e5eb1b17af9c45e0da2993da8a4756cb912 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:43:20 +0000 Subject: [PATCH 261/935] sgemm/dgemm: add a way for an arch kernel to specify prefered sizes The current gemm threading code can make very unfortunate choices, for example on my 10 core system a 1024x1024x1024 matrix multiply ends up chunking into blocks of 102... which is not a vector friendly size and performance ends up horrible. this patch adds a helper define where an architecture can specify a preference for size multiples. This is different from existing defines that are minimum sizes and such. The performance increase with this patch for the 1024x1024x1024 sgemm is 2.3x (!!) --- driver/level3/level3_thread.c | 22 ++++++++++++++++++++++ param.h | 1 + 2 files changed, 23 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..de29247d4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -48,6 +48,10 @@ #define SWITCH_RATIO 2 #endif +#ifndef GEMM_PREFERED_SIZE +#define GEMM_PREFERED_SIZE 1 +#endif + //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, return 0; } +static int round_up(int remainder, int width, int multiple) +{ + if (multiple > remainder || width <= multiple) + return width; + width = (width + multiple - 1) / multiple; + width = width * multiple; + return width; +} + + static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { @@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (m > 0){ width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); + + width = round_up(m, width, GEMM_PREFERED_SIZE); + m -= width; + if (m < 0) width = width + m; range_M[num_parts + 1] = range_M[num_parts] + width; + num_parts ++; } for (i = num_parts; i < MAX_CPU_NUMBER; i++) { @@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG if (width < SWITCH_RATIO) { width = SWITCH_RATIO; } + width = round_up(n, width, GEMM_PREFERED_SIZE); + n -= width; if (n < 0) width = width + n; range_N[num_parts + 1] = range_N[num_parts] + width; + num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { diff --git a/param.h b/param.h index e4ec1b2b5..d1b211584 100644 --- a/param.h +++ b/param.h @@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 32 #ifdef ARCH_X86 From b0255231979ac40444fea06bc8958731fdcdef7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Nov 2018 18:26:08 +0100 Subject: [PATCH 262/935] Use installbsd on AIX (and fix misplaced parenthesis from previous commit). 
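To make the chunking effect concrete, here is a small self-contained sketch of the round_up() helper from the previous patch (the function body is copied from the diff above; the sample numbers are only illustrative):

#include <stdio.h>

/* copied from driver/level3/level3_thread.c in the patch above */
static int round_up(int remainder, int width, int multiple)
{
  if (multiple > remainder || width <= multiple)
    return width;
  width = (width + multiple - 1) / multiple;
  width = width * multiple;
  return width;
}

int main(void)
{
  /* n = 1024 split over 10 threads gives widths around 102..103,
   * which vectorize poorly; rounding to GEMM_PREFERED_SIZE = 32
   * turns such a chunk into a vector-friendly 128. */
  printf("%d\n", round_up(1024, 103, 32)); /* prints 128 */
  printf("%d\n", round_up(24, 103, 32));   /* remainder < multiple: prints 103 */
  return 0;
}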
See #1803 --- Makefile.install | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile.install b/Makefile.install index 7aa477cf0..069c96c6a 100644 --- a/Makefile.install +++ b/Makefile.install @@ -48,7 +48,7 @@ ifndef NO_CBLAS @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif -ifneq (($OSNAME), AIX) +ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @@ -99,23 +99,23 @@ else #install on AIX has different options syntax ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" - @-install -M 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -M 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -M 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) From 9c177d270b7ae78c4542a15ec02d8cab9cc7f367 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Nov 2018 18:50:25 +0100 Subject: [PATCH 263/935] Restore Android/ARMv7 build fix from #778 for #1811 --- lapack-netlib/LAPACKE/include/lapacke_config.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 1e2509bf0..8262c3488 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -34,6 +34,13 @@ #ifndef _LAPACKE_CONFIG_H_ #define _LAPACKE_CONFIG_H_ +// For Android prior to API 21 (no include) +#if defined(__ANDROID__) +#if __ANDROID_API__ < 21 +#define LAPACK_COMPLEX_STRUCTURE +#endif +#endif + #ifdef __cplusplus #if defined(LAPACK_COMPLEX_CPP) #include From fb5b2177ca794f81f85530f223dd630e147092ca Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Mon, 5 Nov 2018 11:30:12 +0000 Subject: [PATCH 264/935] [Arm64) Revert A53 
detection as A57 This patch reverts the decision of treating A53 like A57, which was based on an analysis done on server class hardware and is not representative of all A53s out there. Fixes #1855. --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 17078fe7f..3acb395b5 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -115,8 +115,8 @@ int detect(void) fclose(infile); if(cpu_part != NULL && cpu_implementer != NULL) { if (strstr(cpu_implementer, "0x41") && - (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") )) - return CPU_CORTEXA57; //or compatible A53, A72 + (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08"))) + return CPU_CORTEXA57; //or compatible, ex. A72 else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) return CPU_VULCAN; else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) From 7d3502b5003ad54903b7a9e9aec5a853dfbe0221 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 08:20:55 +0000 Subject: [PATCH 266/935] Add -frecursive gfortran option by default --- Makefile.rule | 4 ++-- Makefile.system | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 6522b0777..d97607f2e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -192,8 +192,8 @@ NO_AFFINITY = 1 # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT # COMMON_OPT = -O2 -# gfortran option for LAPACK -# enable this flag only on 64bit Linux and if you need a thread safe lapack library +# gfortran option for LAPACK to improve thread-safety +# It is enabled by default in Makefile.system for gfortran # Flags for POWER8 are defined in Makefile.power. 
Don't modify FCOMMON_OPT # FCOMMON_OPT = -frecursive diff --git a/Makefile.system b/Makefile.system index b4cd4222a..8de0b8f6e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -718,6 +718,8 @@ endif ifeq ($(F_COMPILER), GFORTRAN) CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall +# make single-threaded LAPACK calls thread-safe #1847 +FCOMMON_OPT += -frecursive #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran From 3fd41313fc2c36ea55a5e3aaf02cf2734f2d18c5 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 09:40:13 +0000 Subject: [PATCH 268/935] add low bound for number of buffers --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 6c3d5b15e..60da2416a 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ From 40cce0e353ca21ed1d045b4fc58faddd2ff6c2a7 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 09:45:49 +0000 Subject: [PATCH 269/935] handle cmake too --- cmake/fc.cmake | 2 +- common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 38d59f956..adec28a91 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -44,7 +44,7 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") diff --git a/common.h b/common.h index 6c3d5b15e..60da2416a 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ From 9531d0e1757dc0edd64c5c439d65fb236195410a Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 6 Nov 2018 17:51:24 +0000 Subject: [PATCH 270/935] lets fit it in one 4k page --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 60da2416a..7fcd5e316 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) +#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ From cfb0f5b0f82e67cf3cc854c8319ddb79ecd1366c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Nov 2018 22:39:10 +0100 Subject: [PATCH 271/935] Set LIBSONAME suffix to .a for AIX another fix for #1803 --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 7847c7525..716bd18e2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1211,7 +1211,11 @@ endif LIBDLLNAME = $(LIBPREFIX).dll IMPLIBNAME = lib$(LIBNAMEBASE).dll.a +ifneq ($(OSNAME), AIX) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) +else +LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) +endif LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = 
$(LIBNAME:.$(LIBSUFFIX)=.exp) From 0427277ceff6e477e06d98abe03e0b2348d6d26a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Nov 2018 15:45:54 +0100 Subject: [PATCH 272/935] Allow optimization for small m, large n only if it can be made threadsafe otherwise the introduction of a static array in 8e5a108 to improve #532 breaks concurrent calls from multiple threads as seen in #1844 --- driver/level2/gemv_thread.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index 061454848..fc4e4f7fe 100644 --- a/driver/level2/gemv_thread.c +++ b/driver/level2/gemv_thread.c @@ -62,9 +62,36 @@ #endif #endif -#ifndef TRANSA +#ifndef thread_local +# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__ +# define thread_local _Thread_local +# elif defined _WIN32 && ( \ + defined _MSC_VER || \ + defined __ICL || \ + defined __DMC__ || \ + defined __BORLANDC__ ) +# define thread_local __declspec(thread) +/* note that ICC (linux) and Clang are covered by __GNUC__ */ +# elif defined __GNUC__ || \ + defined __SUNPRO_C || \ + defined __xlC__ +# define thread_local __thread +# else +# define UNSAFE +#endif +#endif +#if defined USE_OPENMP +#undef UNSAFE +#endif + +#if !defined(TRANSA) && !defined(UNSAFE) #define Y_DUMMY_NUM 1024 +#if defined(USE_OPENMP) static FLOAT y_dummy[Y_DUMMY_NUM]; +#pragma omp threadprivate(y_dummy) +# else +static thread_local FLOAT y_dummy[Y_DUMMY_NUM]; +# endif #endif static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ @@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifdef TRANSA y += n_from * incy * COMPSIZE; #else +# ifndef UNSAFE //for split matrix row (n) direction and vector x of gemv_n x += n_from * incx * COMPSIZE; //store partial result for every thread y += (m_to - m_from) * 1 * COMPSIZE * pos; +# endif #endif } @@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x BLASLONG width, i, num_cpu; -#ifndef TRANSA +#if !defined(TRANSA) && !defined(iUNSAFE) int split_x=0; #endif @@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x i -= width; } -#ifndef TRANSA +#if !defined(TRANSA) && !defined(UNSAFE) //try to split matrix on row direction and x. //Then, reduction. if (num_cpu < nthreads) { @@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x exec_blas(num_cpu, queue); } -#ifndef TRANSA +#if !defined(TRANSA) && !defined(UNSAFE) if(split_x==1){ //reduction for(i=0; i Date: Sat, 10 Nov 2018 17:16:53 +0100 Subject: [PATCH 273/935] Fix argument in SLASET call to zero S fixes #1859 in accordance with https://github.com/LAPACK-Reference/issue/296 --- lapack-netlib/SRC/sgelss.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index 29380d4dc..84a882d2e 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -407,7 +407,7 @@ * Matrix all zero. Return zero solution. 
* CALL SLASET( 'F', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) - CALL SLASET( 'F', MINMN, 1, ZERO, ZERO, S, 1 ) + CALL SLASET( 'F', MINMN, 1, ZERO, ZERO, S, MINMN ) RANK = 0 GO TO 70 END IF From e3666931d8b54f0bf918e45bc3da6ce51ea2a52a Mon Sep 17 00:00:00 2001 From: Arda Aytekin Date: Fri, 9 Nov 2018 00:25:30 +0100 Subject: [PATCH 274/935] Update .travis.yml Updated `.travis.yml` file to add emulated tests for `ARMV6` and `ARMV8` architectures with `gcc` and `clang`. Created prebuilt images with required dependencies. Squashed layers into one. --- .travis.yml | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index a0af0472e..4efa23b8d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,11 +4,10 @@ dist: precise sudo: true language: c -jobs: +matrix: include: - &test-ubuntu os: linux - stage: test compiler: gcc addons: apt: @@ -59,7 +58,6 @@ jobs: - BTYPE="BINARY=32" - os: linux - stage: test compiler: gcc addons: apt: @@ -80,7 +78,6 @@ jobs: # that don't require sudo. - &test-alpine os: linux - stage: test dist: trusty sudo: true language: minimal @@ -124,7 +121,6 @@ jobs: - &test-cmake os: linux - stage: test compiler: clang addons: apt: @@ -153,7 +149,6 @@ jobs: - &test-macos os: osx - stage: test osx_image: xcode8 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" @@ -168,6 +163,42 @@ jobs: env: - BTYPE="BINARY=32" + - &emulated-arm + dist: trusty + sudo: required + services: docker + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc + name: "Emulated Build for ARMV6 with gcc" + before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset + script: | + echo "FROM openblas/alpine:${IMAGE_ARCH} + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=${TARGET_ARCH} \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . 
+ - <<: *emulated-arm + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + name: "Emulated Build for ARMV6 with clang" + - <<: *emulated-arm + env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc + name: "Emulated Build for ARMV8 with gcc" + - <<: *emulated-arm + env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang + name: "Emulated Build for ARMV8 with clang" + + allow_failures: + - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc + - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc + - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang + # whitelist branches: only: From 807f6e6922d7b7c53f79171e5224d11368c28235 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Nov 2018 18:52:29 +0100 Subject: [PATCH 275/935] Use prtconf to determine CPU type on AIX for #1803 --- cpuid_power.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cpuid_power.c b/cpuid_power.c index 6c7baef4a..ebd9e151e 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -127,6 +127,33 @@ int detect(void){ #endif #ifdef _AIX + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = popen("prtconf|grep 'Processor Type'"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("Pro", buffer, 3)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + pclose(infile); + + if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; + if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; + if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; + if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; + if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; + if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; + return CPUTYPE_POWER5; #endif From 2f04cf22accecc0befcc00fbb77dfc76e0506c84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Nov 2018 08:16:14 +0100 Subject: [PATCH 276/935] Detect POWER9 as POWER8 on AIX and Linux (already supported by the *BSD version) --- cpuid_power.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index ebd9e151e..afc94d2d5 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -56,6 +56,7 @@ #define CPUTYPE_CELL 6 #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 +#define CPUTYPE_POWER9 9 char *cpuname[] = { "UNKNOWN", @@ -66,7 +67,8 @@ char *cpuname[] = { "POWER6", "CELL", "PPCG4", - "POWER8" + "POWER8", + "POWER9" }; char *lowercpuname[] = { @@ -78,7 +80,8 @@ char *lowercpuname[] = { "power6", "cell", "ppcg4", - "power8" + "power8", + "power9" }; char *corename[] = { @@ -90,7 +93,8 @@ char *corename[] = { "POWER6", "CELL", "PPCG4", - "POWER8" + "POWER8", + "POWER8" }; int detect(void){ @@ -120,6 +124,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -151,9 +156,9 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, 
"POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; - return CPUTYPE_POWER5; #endif From c171b8ad13054518869cdc54db5af5cf6b886089 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Nov 2018 13:57:18 +0100 Subject: [PATCH 277/935] Handle special case INCX=0,INCY=0 in the axpy interface --- interface/axpy.c | 5 +++++ interface/zaxpy.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/interface/axpy.c b/interface/axpy.c index 39edea6af..9032946d2 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc if (alpha == ZERO) return; + if (incx == 0 && incy == 0) { + *y += n * alpha *(*x); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/zaxpy.c b/interface/zaxpy.c index 1a0259c96..dbd559628 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -82,6 +82,12 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + if (incx == 0 && incy == 0) { + *y += n * (alpha_r * (*x) - alpha_i* (*(x+1)) ); + *(y+1) += n * (alpha_i * (*x) + alpha_r * (*(x +1)) ); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); From 43bb386b10d94b341d5c8a27b5634081bb87de7f Mon Sep 17 00:00:00 2001 From: fengruilin Date: Thu, 15 Nov 2018 11:11:59 +0800 Subject: [PATCH 278/935] fix dot problem on 64bit mips --- kernel/mips64/KERNEL | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index e257dcfc9..3804b245d 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -6,6 +6,11 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S From 42bc2a92023070ee871ffd81b6a9b8fb6dd1892b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Nov 2018 12:10:44 +0100 Subject: [PATCH 279/935] Fix copy-paste errors (POWER8/9 and extraneous return) --- cpuid_power.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index afc94d2d5..fc36f8e2c 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -156,7 +156,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; @@ -180,7 +180,7 @@ int id; id = __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 - return return CPUTYPE_POWER8; + return CPUTYPE_POWER8; break; case 0x4d: case 0x4b: // POWER8/8E From 368d14f8c8b2eb2916d7cd6765f40c5aa31e2184 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Nov 2018 14:58:28 +0100 Subject: [PATCH 280/935] Fix harmless typo fixes #1872 --- driver/level2/gemv_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index fc4e4f7fe..d57740314 100644 --- a/driver/level2/gemv_thread.c +++ b/driver/level2/gemv_thread.c @@ -165,7 +165,7 @@ 
int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x BLASLONG width, i, num_cpu; -#if !defined(TRANSA) && !defined(iUNSAFE) +#if !defined(TRANSA) && !defined(UNSAFE) int split_x=0; #endif From 2e6fae2aad240fe6be8273cc53bc239ee920ee7c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Nov 2018 14:02:50 +0100 Subject: [PATCH 281/935] Serialize accesses to parallelized level3 functions from multiple callers for #1851 --- driver/level3/level3_thread.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..15cad9274 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -514,6 +514,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { +#ifndef USE_OPENMP +static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; +#endif + blas_arg_t newarg; #ifndef USE_ALLOC_HEAP @@ -554,6 +558,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #endif #endif +#ifndef USE_OPENMP +pthread_mutex_lock(&level3_lock); +#endif + #ifdef USE_ALLOC_HEAP /* Dynamically allocate workspace */ job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); @@ -671,6 +679,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG free(job); #endif +#ifndef USE_OPENMP + pthread_mutex_unlock(&level3_lock); +#endif + return 0; } From 310ea55f29f16771438386fb2f1f140e2fd7e397 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Fri, 16 Nov 2018 15:45:12 +0000 Subject: [PATCH 282/935] Simplifying ARMv8 build parameters ARMv8 builds were a bit mixed up, with ThunderX2 code in ARMv8 mode (which is not right because TX2 is ARMv8.1) as well as requiring a few redundancies in the defines, making it harder to maintain and understand what core has what. A few other minor issues were also fixed. Tests were made on the following cores: A53, A57, A72, Falkor, ThunderX, ThunderX2, and XGene. Tests were: OpenBLAS/test, OpenBLAS/benchmark, BLAS-Tester. A summary: * Removed TX2 code from ARMv8 build, to make sure it is compatible with all ARMv8 cores, not just v8.1. Also, the TX2 code has actually harmed performance on big cores. * Commoned up ARMv8 architectures' defines in params.h, to make sure that all will benefit from ARMv8 settings, in addition to their own. * Adding a few more cores, using ARMv8's include strategy, to benefit from compiler optimisations using mtune. Also updated cache information from the manuals, making sure we set good conservative values by default. Removed Vulcan, as it's an alias to TX2. * Auto-detecting most of those cores, but also updating the forced compilation in getarch.c, to make sure the parameters are the same whether compiled natively or forced arch. Benefits: * ARMv8 build is now guaranteed to work on all ARMv8 cores * Improved performance for ARMv8 builds on some cores (A72, Falkor, ThunderX1 and 2: up to 11%) over current develop * Improved performance for *all* cores comparing to develop branch before TX2's patch (9% ~ 36%) * ThunderX1 builds are 14% faster than ARMv8 on TX1, 9% faster than current develop's branch and 8% faster than deveop before tx2 patches Issues: * Regression from current develop branch for A53 (-12%) and A57 (-3%) with ARMv8 builds, but still faster than before TX2's commit (+15% and +24% respectively). This can be improved with a simplification of TX2's code, to be done in future patches. 
At least the code is guaranteed to be ARMv8.0 now. Comments: * CortexA57 builds are unchanged on A57 hardware from develop's branch, which makes sense, as it's untouched. * CortexA72 builds improve over A57 on A72 hardware, even if they're using the same includes due to new compiler tunning in the makefile. --- Makefile.arm64 | 33 ++++++--- TargetList.txt | 5 +- cpuid_arm64.c | 126 +++++++++++++++++++--------------- getarch.c | 78 +++++++++++++++++---- kernel/arm64/KERNEL.ARMV8 | 68 +++++++----------- kernel/arm64/KERNEL.CORTEXA53 | 3 + kernel/arm64/KERNEL.CORTEXA72 | 3 + kernel/arm64/KERNEL.CORTEXA73 | 3 + kernel/arm64/KERNEL.FALKOR | 3 + kernel/arm64/KERNEL.VULCAN | 3 - param.h | 124 +++++++++++++-------------------- 11 files changed, 249 insertions(+), 200 deletions(-) create mode 100644 kernel/arm64/KERNEL.CORTEXA53 create mode 100644 kernel/arm64/KERNEL.CORTEXA72 create mode 100644 kernel/arm64/KERNEL.CORTEXA73 create mode 100644 kernel/arm64/KERNEL.FALKOR delete mode 100644 kernel/arm64/KERNEL.VULCAN diff --git a/Makefile.arm64 b/Makefile.arm64 index d19e796a5..a529fab80 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -4,22 +4,37 @@ CCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a endif +ifeq ($(CORE), CORTEXA53) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +endif + ifeq ($(CORE), CORTEXA57) -CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 -FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +endif + +ifeq ($(CORE), CORTEXA72) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif -ifeq ($(CORE), VULCAN) -CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan -FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan +ifeq ($(CORE), CORTEXA73) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif ifeq ($(CORE), THUNDERX) -CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx -FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx +CCOMMON_OPT += -march=armv8-a -mtune=thunderx +FCOMMON_OPT += -march=armv8-a -mtune=thunderx +endif + +ifeq ($(CORE), FALKOR) +CCOMMON_OPT += -march=armv8.1-a -mtune=falkor +FCOMMON_OPT += -march=armv8.1-a -mtune=falkor endif ifeq ($(CORE), THUNDERX2T99) -CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 -FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif diff --git a/TargetList.txt b/TargetList.txt index 31e4881c4..3d04a57cf 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -83,8 +83,11 @@ ARMV5 8.ARM 64-bit CPU: ARMV8 +CORTEXA53 CORTEXA57 -VULCAN +CORTEXA72 +CORTEXA73 +FALKOR THUNDERX THUNDERX2T99 diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 3acb395b5..c914fbc2b 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -29,25 +29,37 @@ #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 -#define CPU_CORTEXA57 2 -#define CPU_VULCAN 3 -#define CPU_THUNDERX 4 -#define CPU_THUNDERX2T99 5 +// Arm +#define CPU_CORTEXA53 2 +#define CPU_CORTEXA57 3 +#define CPU_CORTEXA72 4 +#define CPU_CORTEXA73 5 +// Qualcomm +#define CPU_FALKOR 6 +// Cavium +#define CPU_THUNDERX 7 +#define CPU_THUNDERX2T99 8 static char *cpuname[] = { "UNKNOWN", "ARMV8" , + "CORTEXA53", "CORTEXA57", - "VULCAN", + "CORTEXA72", + "CORTEXA73", + "FALKOR", "THUNDERX", "THUNDERX2T99" }; static char *cpuname_lower[] = { "unknown", - "armv8" , + 
"armv8", + "cortexa53", "cortexa57", - "vulcan", + "cortexa72", + "cortexa73", + "falkor", "thunderx", "thunderx2t99" }; @@ -114,14 +126,24 @@ int detect(void) fclose(infile); if(cpu_part != NULL && cpu_implementer != NULL) { - if (strstr(cpu_implementer, "0x41") && - (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08"))) - return CPU_CORTEXA57; //or compatible, ex. A72 - else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) - return CPU_VULCAN; - else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) + // Arm + if (strstr(cpu_implementer, "0x41")) { + if (strstr(cpu_part, "0xd03")) + return CPU_CORTEXA53; + else if (strstr(cpu_part, "0xd07")) + return CPU_CORTEXA57; + else if (strstr(cpu_part, "0xd08")) + return CPU_CORTEXA72; + else if (strstr(cpu_part, "0xd09")) + return CPU_CORTEXA73; + } + // Qualcomm + else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) + return CPU_FALKOR; + // Cavium + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1")) return CPU_THUNDERX; - else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43")) + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; } @@ -180,62 +202,62 @@ void get_subdirname(void) void get_cpuconfig(void) { + // All arches should define ARMv8 + printf("#define ARMV8\n"); + printf("#define HAVE_NEON\n"); // This shouldn't be necessary + printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary + int d = detect(); switch (d) { + case CPU_CORTEXA53: + printf("#define %s\n", cpuname[d]); + // Fall-through case CPU_ARMV8: - printf("#define ARMV8\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); - break; - - case CPU_VULCAN: - printf("#define VULCAN \n"); - printf("#define HAVE_VFP \n"); - printf("#define HAVE_VFPV3 \n"); - printf("#define HAVE_NEON \n"); - printf("#define HAVE_VFPV4 \n"); - printf("#define L1_CODE_SIZE 32768 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 262144 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 33554432 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); + // Minimum parameters for ARMv8 (based on A53) + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); break; case CPU_CORTEXA57: - printf("#define CORTEXA57\n"); - printf("#define HAVE_VFP\n"); - printf("#define HAVE_VFPV3\n"); - printf("#define HAVE_NEON\n"); - printf("#define HAVE_VFPV4\n"); + case CPU_CORTEXA72: + case CPU_CORTEXA73: + // Common minimum settings for these Arm cores + // Can change a lot, but we need to be conservative + // TODO: detect info from /sys if possible + printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 49152\n"); printf("#define L1_CODE_LINESIZE 
64\n"); printf("#define L1_CODE_ASSOCIATIVE 3\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_ASSOCIATIVE 2\n"); - printf("#define L2_SIZE 2097152\n"); + printf("#define L2_SIZE 524288\n"); printf("#define L2_LINESIZE 64\n"); printf("#define L2_ASSOCIATIVE 16\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_FALKOR: + printf("#define FALKOR\n"); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + case CPU_THUNDERX: printf("#define THUNDERX\n"); printf("#define L1_DATA_SIZE 32768\n"); @@ -249,10 +271,6 @@ void get_cpuconfig(void) case CPU_THUNDERX2T99: printf("#define VULCAN \n"); - printf("#define HAVE_VFP \n"); - printf("#define HAVE_VFPV3 \n"); - printf("#define HAVE_NEON \n"); - printf("#define HAVE_VFPV4 \n"); printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n"); diff --git a/getarch.c b/getarch.c index 31f41d62c..146f1f36f 100644 --- a/getarch.c +++ b/getarch.c @@ -927,11 +927,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DARMV8 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "armv8" #define CORENAME "ARMV8" #endif +#ifdef FORCE_CORTEXA53 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA53" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA53 " \ + "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa53" +#define CORENAME "CORTEXA53" +#else +#endif + #ifdef FORCE_CORTEXA57 #define FORCE #define ARCHITECTURE "ARM64" @@ -942,26 +959,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa57" #define CORENAME "CORTEXA57" #else #endif -#ifdef FORCE_VULCAN +#ifdef FORCE_CORTEXA72 #define FORCE #define ARCHITECTURE "ARM64" -#define SUBARCHITECTURE "VULCAN" +#define SUBARCHITECTURE "CORTEXA72" #define SUBDIRNAME "arm64" -#define ARCHCONFIG "-DVULCAN " \ - "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ - "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ - "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ - "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ +#define ARCHCONFIG "-DCORTEXA72 " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa72" +#define CORENAME "CORTEXA72" +#else +#endif + +#ifdef FORCE_CORTEXA73 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA73" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA73 " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa73" +#define CORENAME "CORTEXA73" +#else +#endif + +#ifdef FORCE_FALKOR +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "FALKOR" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DFALKOR " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" -#define LIBNAME "vulcan" -#define CORENAME "VULCAN" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "falkor" +#define CORENAME "FALKOR" #else #endif @@ -973,13 +1021,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DTHUNDERX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx" #define CORENAME "THUNDERX" #else #endif #ifdef FORCE_THUNDERX2T99 +#define ARMV8 #define FORCE #define ARCHITECTURE "ARM64" #define SUBARCHITECTURE "THUNDERX2T99" @@ -990,7 +1040,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx2t99" #define CORENAME "THUNDERX2T99" #else diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index bcecd0026..5c70390dc 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -46,7 +46,7 @@ CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S +DAXPYKERNEL = axpy.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S @@ -71,39 +71,37 @@ CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S ifneq ($(OS_DARWIN)$(CROSS),11) -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +CNRM2KERNEL = nrm2.S +DNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S endif -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c +DDOTKERNEL = dot.S +SDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S ifneq ($(OS_DARWIN)$(CROSS),11) @@ -175,22 +173,6 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) -DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) -SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S -endif - -ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) -CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) -ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S -endif - else STRMMKERNEL = ../generic/trmmkernel_2x2.c diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 new file mode 100644 index 000000000..c1d33fa3e --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.ARMV8 + + diff --git a/kernel/arm64/KERNEL.CORTEXA72 b/kernel/arm64/KERNEL.CORTEXA72 new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA72 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/arm64/KERNEL.CORTEXA73 b/kernel/arm64/KERNEL.CORTEXA73 new file mode 100644 index 
000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA73 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/arm64/KERNEL.FALKOR b/kernel/arm64/KERNEL.FALKOR new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.FALKOR @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/arm64/KERNEL.VULCAN b/kernel/arm64/KERNEL.VULCAN deleted file mode 100644 index 8b0273951..000000000 --- a/kernel/arm64/KERNEL.VULCAN +++ /dev/null @@ -1,3 +0,0 @@ -include $(KERNELDIR)/KERNEL.THUNDERX2T99 - - diff --git a/param.h b/param.h index d1b211584..8f56cdaaa 100644 --- a/param.h +++ b/param.h @@ -2543,8 +2543,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif +// Common ARMv8 parameters +#if defined(ARMV8) -#if defined(CORTEXA57) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2552,46 +2553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 16 -#define SGEMM_DEFAULT_UNROLL_N 4 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 - -#define SGEMM_DEFAULT_P 512 -#define DGEMM_DEFAULT_P 256 -#define CGEMM_DEFAULT_P 256 -#define ZGEMM_DEFAULT_P 128 - -#define SGEMM_DEFAULT_Q 1024 -#define DGEMM_DEFAULT_Q 512 -#define CGEMM_DEFAULT_Q 512 -#define ZGEMM_DEFAULT_Q 512 - -#define SGEMM_DEFAULT_R 4096 -#define DGEMM_DEFAULT_R 4096 -#define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 2048 - - #define SYMV_P 16 -#endif - -#if defined(ARMV8) +// Darwin / Cross #if defined(OS_DARWIN) && defined(CROSS) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2620,15 +2585,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#define SYMV_P 16 -#else +#else // Linux / Native -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#if defined(CORTEXA53) || defined(CORTEXA57) || \ + defined(CORTEXA72) || defined(CORTEXA73) || \ + defined(FALKOR) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2642,33 +2603,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 -#define CGEMM_DEFAULT_P 128 +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 112 +#define SGEMM_DEFAULT_Q 1024 +#define DGEMM_DEFAULT_Q 512 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 512 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 4096 - -#define SYMV_P 16 -#endif - -#endif - -#if defined(THUNDERX) -#define SNUMOPT 2 -#define DNUMOPT 2 +#define ZGEMM_DEFAULT_R 2048 -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#elif defined(THUNDERX) #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2697,17 +2647,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(THUNDERX2T99) -#define SYMV_P 16 -#endif +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 -#if defined(THUNDERX2T99) || defined(VULCAN) -#define SNUMOPT 2 -#define DNUMOPT 2 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#else // Other/undetected ARMv8 cores #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2736,8 +2705,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#define SYMV_P 16 -#endif +#endif // Cores + +#endif // Linux / Darwin + +#endif // ARMv8 #if defined(ARMV5) #define SNUMOPT 2 From 5192651706d39b35e82b6f62f2b02764cdb3983c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Nov 2018 17:58:22 +0100 Subject: [PATCH 283/935] Add CriticalSection handling instead of mutexes for Windows --- driver/level3/level3_thread.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 15cad9274..ac96f9424 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -515,7 +515,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG nthreads_m, BLASLONG nthreads_n) { #ifndef USE_OPENMP +#ifndef OS_WINDOWS static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; +#else +CRITICAL_SECTION level3_lock; +InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock; +#endif #endif blas_arg_t newarg; @@ -559,7 +564,11 @@ static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; #endif #ifndef USE_OPENMP +#ifndef OS_WINDOWS pthread_mutex_lock(&level3_lock); +#else +EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif #endif #ifdef USE_ALLOC_HEAP @@ -680,7 +689,11 @@ pthread_mutex_lock(&level3_lock); #endif #ifndef USE_OPENMP +#ifndef OS_WINDOWS pthread_mutex_unlock(&level3_lock); +#else + LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif #endif return 0; From 113cb00b95626d037647107aaa1f00027772b0da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Nov 2018 21:01:36 +0100 Subject: [PATCH 284/935] fix missing parenthesis --- driver/level3/level3_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ac96f9424..3411a3e9b 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -519,7 +519,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; #else CRITICAL_SECTION level3_lock; -InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock; +InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); #endif #endif From 0184713e1a2c3ae99f500edce105ab0f42e96de6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Bissey?= Date: Wed, 21 Nov 2018 14:24:56 +1300 Subject: [PATCH 285/935] Correct link flags for PGI compiler. --- f_check | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/f_check b/f_check index 997e02393..34caa00be 100644 --- a/f_check +++ b/f_check @@ -292,9 +292,6 @@ if ($link ne "") { && ($flags !~ /^-LIST:/) && ($flags !~ /^-LANG:/) ) { - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= $flags . " "; } @@ -311,17 +308,11 @@ if ($link ne "") { if ($flags =~ /^\-rpath\@/) { $flags =~ s/\@/\,/g; - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= "-Wl,". $flags . " " ; } if ($flags =~ /^\-rpath-link\@/) { $flags =~ s/\@/\,/g; - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= "-Wl,". $flags . 
" " ; } @@ -330,7 +321,6 @@ if ($link ne "") { && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) - && ($flags !~ /numa/) && ($flags !~ /crt[0-9]/) && ($flags !~ /gcc/) && ($flags !~ /user32/) From 19c4bdd8b3f3fc5a97a5b756f6590bdb6d2a3ee9 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 25 Nov 2018 21:35:01 +0100 Subject: [PATCH 287/935] Add return value so that freebsd system clang does not err out --- kernel/x86_64/sgemm_beta_skylakex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 4e40acadf..498c46f0d 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, } if (n == 0 || m == 0) - return; + return 0; c_offset = c; From 816775e3099cba07b4ad2636090c1f752d9f8b3e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 00:06:44 +0100 Subject: [PATCH 288/935] Add version information to openblas_get_config output --- driver/others/openblas_get_config.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 3e87f2cc2..471be21bc 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif static char* openblas_config_str="" +"VERSION " + VERSION #ifdef USE64BITINT - "USE64BITINT " + " USE64BITINT " #endif #ifdef NO_CBLAS "NO_CBLAS " From a29ec458c238a9b1183baaf6d5c99d14d206987a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 00:10:49 +0100 Subject: [PATCH 289/935] propagate verison number for openblas_config_version --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index 1427d09fb..22fe24337 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1036,6 +1036,8 @@ ifdef USE_TLS CCOMMON_OPT += -DUSE_TLS endif +CCOMMON_OPT += -DVERSION=\"$(VERSION)\" + ifndef SYMBOLPREFIX SYMBOLPREFIX = endif From 081ceb3e029e04b3a2773915cc67dc848bab3ef2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 00:12:04 +0100 Subject: [PATCH 290/935] Propagate version number for openblas_get_config --- cmake/system.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 61f96edb0..d803bb9eb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -310,6 +310,8 @@ if (MIXED_MEMORY_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") endif () +set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"") + set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) From de0d0ed52f314a6b370fab03bc21ebbb3d943bbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 11:28:19 +0100 Subject: [PATCH 291/935] Improve formatting of config output --- driver/others/openblas_get_config.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 471be21bc..4f22325b6 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -44,6 +44,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static char* openblas_config_str="" "VERSION " VERSION +" " #ifdef USE64BITINT " USE64BITINT " #endif From 97d72989739163171930046dba8d7a3214f49b9c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Nov 2018 11:52:08 +0100 Subject: [PATCH 292/935] call it OpenBLAS not just version --- driver/others/openblas_get_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 4f22325b6..eca494dca 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif static char* openblas_config_str="" -"VERSION " +"OpenBLAS " VERSION " " #ifdef USE64BITINT From 7a2e1bc8041a898cadea475a0562e5b40ec49750 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Nov 2018 10:57:09 +0100 Subject: [PATCH 293/935] Use generic kernel for DSDOT/SDSDOT as discussed in #1834 --- kernel/mips64/KERNEL.LOONGSON3A | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 2d03ad7fa..0298faaad 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -63,6 +63,7 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DSDOTKERNEL = ../mips/dot.c From 95a5542e3c21def6e63e9de8b5c1850830fc0289 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Nov 2018 11:16:24 +0100 Subject: [PATCH 294/935] Revert DOT kernel changes from #1834 as the failures seen on Loongson3A appear to be limited to DSDOT/SDSDOT (i.e. my hackish "fix" from #1684) --- kernel/mips64/KERNEL | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index e257dcfc9..f77ca19ed 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -1,9 +1,9 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c -SROTKERNEL = ../mips/rot.c -DROTKERNEL = ../mips/rot.c -CROTKERNEL = ../mips/zrot.c -ZROTKERNEL = ../mips/zrot.c +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c From 2601cd58ab55d0b76c305bde1d320b8ab0da25ed Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Thu, 11 Oct 2018 23:29:34 +0300 Subject: [PATCH 295/935] remove surplus locking code , only enabled w x86, disabled or never enabled on all others --- driver/others/memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 25f198623..36815a39c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2586,7 +2586,7 @@ void *blas_memory_alloc(int procpos){ printf("Alloc Start ...\n"); #endif -#if defined(WHEREAMI) && !defined(USE_OPENMP) +/* #if defined(WHEREAMI) && !defined(USE_OPENMP) mypos = WhereAmI(); @@ -2596,12 +2596,12 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { LOCK_COMMAND(&alloc_lock); -/* blas_lock(&memory[position].lock);*/ +// blas_lock(&memory[position].lock); if (!memory[position].used) goto allocation; UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ +// blas_unlock(&memory[position].lock); } position ++; @@ -2609,7 +2609,7 @@ void *blas_memory_alloc(int procpos){ } while (position 
< NUM_BUFFERS); -#endif +#endif */ position = 0; From f85ce54d4a2c23b27d80ec454e150b5388d5d38c Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 30 Nov 2018 16:05:49 +0000 Subject: [PATCH 296/935] Use correct Makefile on powerpc64 FreeBSD uses the powerpc64 name for the POWER architecture. Use the correct Makefile for this platform. --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 22fe24337..bf2b76fae 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1060,7 +1060,11 @@ endif KERNELDIR = $(TOPDIR)/kernel/$(ARCH) +ifneq ($(ARCH), powerpc64) include $(TOPDIR)/Makefile.$(ARCH) +else +include $(TOPDIR)/Makefile.power +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" From 731b2722ba4ba25d982682e47cbad0b780bd24d3 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 30 Nov 2018 16:04:07 +0000 Subject: [PATCH 297/935] Fix build on POWER, remove DragonFly, add NetBSD. __asm is complete on its own. DragonFly BSD developers state they will only support amd64, but NetBSD supports POWER. --- cpuid_power.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index fc36f8e2c..23e98ebb0 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -175,9 +175,9 @@ int detect(void){ return CPUTYPE_PPC970; #endif -#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) int id; -id = __asm __volatile("mfpvr %0" : "=r"(id)); +__asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 return CPUTYPE_POWER8; From 38cc63859131921885b80ed5139304dc80c5a163 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Nov 2018 21:32:01 +0100 Subject: [PATCH 298/935] Really revert xDOT changes from #1832; neglected to rebase #1892 on merging --- kernel/mips64/KERNEL | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 3a26b820c..61da7445f 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -6,12 +6,8 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c -SDOTKERNEL = ../mips/dot.c -DDOTKERNEL = ../mips/dot.c -CDOTKERNEL = ../mips/zdot.c -ZDOTKERNEL = ../mips/zdot.c - - + + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif From dceff5542ce5aaf9b0a7198612c7fdf36228f3bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Dec 2018 20:56:11 +0100 Subject: [PATCH 299/935] Handle Android environments that identify as Linux (#1898) * Handle Android environments that identify as Linux The termux terminal emulator does this, causing build failures through missed defines in common.h --- cmake/system_check.cmake | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fe30c7600..6b602c1b0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS") set(HOST_OS WINNT) endif () +if (${HOST_OS} STREQUAL "LINUX") +# check if we're building natively on Android (TERMUX) + EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) + if(${OPERATING_SYSTEM} MATCHES "Android") + set(HOST_OS ANDROID) + endif(${OPERATING_SYSTEM} MATCHES "Android") +endif() + + + if(CMAKE_COMPILER_IS_GNUCC AND WIN32) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine 
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE From 26b3710485dbcd614f352713a2fc2637741fa25a Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 2 Dec 2018 12:07:41 +0100 Subject: [PATCH 301/935] Add architecture mappings for FreeBSD12 --- Makefile.system | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 1427d09fb..42f446996 100644 --- a/Makefile.system +++ b/Makefile.system @@ -12,7 +12,13 @@ endif # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 -endif +else ifeq ($(ARCH), powerpc64) +override ARCH=power +else ifeq (($ARCH), i386) +override ARCH=x86 +else ifeq ($(ARCH), aarch64) +override ARCH=arm64 +endif NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib From 44c81fd1355cef9b07189ebaad061709be0cd7c6 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 2 Dec 2018 20:27:53 +0100 Subject: [PATCH 302/935] oops --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 42f446996..25ac38dc0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -14,7 +14,7 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power -else ifeq (($ARCH), i386) +else ifeq ($(ARCH), i386) override ARCH=x86 else ifeq ($(ARCH), aarch64) override ARCH=arm64 From 3c9e3faedb1d861dc094ebff0c508c679c4a3cb8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:24:53 +0100 Subject: [PATCH 303/935] fixup BSD naming of powerpc arch --- Makefile.system | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index bf2b76fae..6919c0114 100644 --- a/Makefile.system +++ b/Makefile.system @@ -11,7 +11,11 @@ endif # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) -override ARCH=x86_64 +override ARCH=x86_64 +else ifeq ($(ARCH), powerpc64) +override ARCH=power +endif + endif NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib @@ -1060,11 +1064,7 @@ endif KERNELDIR = $(TOPDIR)/kernel/$(ARCH) -ifneq ($(ARCH), powerpc64) include $(TOPDIR)/Makefile.$(ARCH) -else -include $(TOPDIR)/Makefile.power -endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" From c0827a716473bd61d3e8fa44c25184d370400267 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:41:17 +0100 Subject: [PATCH 304/935] Update with changes from 0.3.4 --- Changelog.txt | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index faecd82e3..0dd17a558 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,77 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.4 +02-Dec-2018 + +common: + * the new, experimental thread-local memory allocation had + inadvertently been left enabled for gmake builds in 0.3.3 + despite the announcement. It is now disabled by default, and + single-threaded builds will keep using the old allocator even + if the USE_TLS option is turned on. + * OpenBLAS will now provide enough buffer space for at least 50 + threads by default. + * The output of openblas_get_config() now contains the version + number. + * A serious thread safety bug in GEMV operation with small M and + large N size has been fixed. 
+ * The code will now automatically call blas_thread_init after a + fork if needed before handling a call to openblas_set_num_threads + * Accesses to parallelized level3 functions from multiple callers + are now serialized to avoid thread races (unless using OpenMP). + This should provide better performance than the known-threadsafe + (but non-default) USE_SIMPLE_THREADED_LEVEL3 option. + * When building LAPACK with gfortran, -frecursive is now (again) + enabled by default to ensure correct behaviour. + * The OpenBLAS version cblas.h now supports both CBLAS_ORDER and + CBLAS_LAYOUT as the name of the matrix row/column order option. + * Externally set LDFLAGS are now passed through to the final compile/link + steps to facilitate setting platform-specific linker flags. + * A potential race condition during the build of LAPACK (that would + usually manifest itself as a failure to build TESTING/MATGEN) has been + fixed. + * xHEMV has been changed to stay single-threaded for small input sizes + where the overhead of multithreading exceeds any possible gains + * CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or + ThunderX hardware with sizable input. + * Linker flags for the PGI compiler have been updated + * Behaviour of AXPY with zero increments is now handled in the C interface, + correcting the result on at least Intel Atom. + * The result matrix from calling SGELSS with an all-zero input matrix is + now zeroed completely. + +x86_64: + * Autodetection of AMD Ryzen2 has been fixed (again). + * CMAKE builds now support labeling of an INTERFACE64=1 build of + the library with the _64 suffix. + * AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel + has been sped up by rewriting with C intrinsics + * Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS) + +POWER: + * added support for building on AIX (with gcc and GNU tools from AIX Toolbox). + * CPU type detection has been implemented for AIX. + * CPU type detection has been fixed for NETBSD. + +MIPS64: + * AXPY on LOONGSON3A has been corrected to pass "zero increment" utest. + * DSDOT on LOONGSON3A has been fixed. + * the SGEMM microkernel has been hardened against potential data loss. + +ARMV8: + * DYNAMIC_ARCH support is now available for 64bit ARM + * cross-compiling for ARMV8 under iOS now works. + * cpu-specific code has been rearranged to make better use of both + hardware commonalities and model-specific compiler optimizations. + * XGENE1 has been removed as a TARGET, superseded by the improved generic + ARMV8 support. + +ARMV7: + * Older assembly mnemonics have been converted to UAL form to allow + building with clang 7.0 + * Cross compiling LAPACKE for Android has been fixed again (broken by + update to LAPACK 3.7.0 some while ago). 
+ ==================================================================== Version 0.3.3 31-Aug-2018 From 93fa6b7b76ffbd56ffce54ac11467d580f53537c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:42:33 +0100 Subject: [PATCH 305/935] Increment version to 0.3.5.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 296113941..24c169afe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 4) +set(OpenBLAS_PATCH_VERSION 5.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From f5acaad8f0590502e26539917a0704e572e17abc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:43:15 +0100 Subject: [PATCH 306/935] Increment version to 0.3.5.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index f3086a01b..0d5b83b39 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.4 +VERSION = 0.3.5.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 360374be62cab8f5be8baecfa675da59a571608d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Dec 2018 23:44:13 +0100 Subject: [PATCH 307/935] Update with the changes from 0.3.4 --- Changelog.txt | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index faecd82e3..0dd17a558 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,77 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.4 +02-Dec-2018 + +common: + * the new, experimental thread-local memory allocation had + inadvertently been left enabled for gmake builds in 0.3.3 + despite the announcement. It is now disabled by default, and + single-threaded builds will keep using the old allocator even + if the USE_TLS option is turned on. + * OpenBLAS will now provide enough buffer space for at least 50 + threads by default. + * The output of openblas_get_config() now contains the version + number. + * A serious thread safety bug in GEMV operation with small M and + large N size has been fixed. + * The code will now automatically call blas_thread_init after a + fork if needed before handling a call to openblas_set_num_threads + * Accesses to parallelized level3 functions from multiple callers + are now serialized to avoid thread races (unless using OpenMP). + This should provide better performance than the known-threadsafe + (but non-default) USE_SIMPLE_THREADED_LEVEL3 option. + * When building LAPACK with gfortran, -frecursive is now (again) + enabled by default to ensure correct behaviour. + * The OpenBLAS version cblas.h now supports both CBLAS_ORDER and + CBLAS_LAYOUT as the name of the matrix row/column order option. + * Externally set LDFLAGS are now passed through to the final compile/link + steps to facilitate setting platform-specific linker flags. + * A potential race condition during the build of LAPACK (that would + usually manifest itself as a failure to build TESTING/MATGEN) has been + fixed. 
+ * xHEMV has been changed to stay single-threaded for small input sizes + where the overhead of multithreading exceeds any possible gains + * CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or + ThunderX hardware with sizable input. + * Linker flags for the PGI compiler have been updated + * Behaviour of AXPY with zero increments is now handled in the C interface, + correcting the result on at least Intel Atom. + * The result matrix from calling SGELSS with an all-zero input matrix is + now zeroed completely. + +x86_64: + * Autodetection of AMD Ryzen2 has been fixed (again). + * CMAKE builds now support labeling of an INTERFACE64=1 build of + the library with the _64 suffix. + * AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel + has been sped up by rewriting with C intrinsics + * Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS) + +POWER: + * added support for building on AIX (with gcc and GNU tools from AIX Toolbox). + * CPU type detection has been implemented for AIX. + * CPU type detection has been fixed for NETBSD. + +MIPS64: + * AXPY on LOONGSON3A has been corrected to pass "zero increment" utest. + * DSDOT on LOONGSON3A has been fixed. + * the SGEMM microkernel has been hardened against potential data loss. + +ARMV8: + * DYNAMIC_ARCH support is now available for 64bit ARM + * cross-compiling for ARMV8 under iOS now works. + * cpu-specific code has been rearranged to make better use of both + hardware commonalities and model-specific compiler optimizations. + * XGENE1 has been removed as a TARGET, superseded by the improved generic + ARMV8 support. + +ARMV7: + * Older assembly mnemonics have been converted to UAL form to allow + building with clang 7.0 + * Cross compiling LAPACKE for Android has been fixed again (broken by + update to LAPACK 3.7.0 some while ago). 
+ ==================================================================== Version 0.3.3 31-Aug-2018 From ea6d1b96bd3fdaf8e8b4d912bdd906cbcb9b1bbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 Dec 2018 08:59:10 +0100 Subject: [PATCH 308/935] Update Makefile.system --- Makefile.system | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 6919c0114..3cf5a16b2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -16,8 +16,6 @@ else ifeq ($(ARCH), powerpc64) override ARCH=power endif -endif - NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # Default C compiler From 701ea88347461e4c5d896765438dc870281b3834 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 Dec 2018 13:06:43 +0100 Subject: [PATCH 309/935] Use p2align instead of align for OSX compatibility fixes #1902 --- kernel/x86_64/dgemm_kernel_4x8_skylakex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c index a83ca98fa..6257e569e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -869,7 +869,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovapd %%zmm1, %%zmm27\n" "vmovapd %%zmm1, %%zmm28\n" "jmp .label24\n" - ".align 32\n" + ".p2align 5\n" /* Inner math loop */ ".label24:\n" "vmovupd -128(%[AO]),%%zmm0\n" @@ -1037,7 +1037,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovapd %%zmm1, %%zmm17\n" "vmovapd %%zmm1, %%zmm18\n" "jmp .label16\n" - ".align 32\n" + ".p2align 5\n" /* Inner math loop */ ".label16:\n" "vmovupd -128(%[AO]),%%zmm0\n" @@ -1165,7 +1165,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, "vmovapd %%zmm1, %%zmm8\n" "vbroadcastsd (%[alpha]), %%zmm9\n" "jmp .label1\n" - ".align 32\n" + ".p2align 5\n" /* Inner math loop */ ".label1:\n" "vmovupd -128(%[AO]),%%zmm0\n" From 31a490ea887dd078233aebffc5a57a093fe2d886 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Wed, 5 Dec 2018 18:51:38 +0000 Subject: [PATCH 310/935] Fix two mistakes on Arm64 builds * Falkor is an ARMv8.0 with ARMv8.1 features, and choosing armv8.1-a for march generates instructions it cannot cope with. Reverting it back to armv8-a. * ThunderX2's build was left with a #define VULCAN, which made it miss the right compiler flags in Makefile.arm64, although it did create the right library in the end. 
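The march distinction in this patch matters because the compiler, not the hardware, decides which instruction forms a given piece of C is lowered to. A minimal sketch of the consequence (not part of the patch; only the __ARM_FEATURE_ATOMICS guard is the standard ACLE feature-test macro, the helper is illustrative):

#include <stdatomic.h>

/* With -march=armv8.1-a the compiler may lower this to a single LSE
 * "ldadd" instruction; with plain -march=armv8-a it must emit an
 * ldxr/stxr retry loop, which is the only form an ARMv8.0 core such
 * as Falkor can actually execute. */
static long counter_add(_Atomic long *counter, long delta)
{
    return atomic_fetch_add(counter, delta);
}

#if defined(__ARM_FEATURE_ATOMICS)  /* ACLE macro: LSE atomics available */
/* any ARMv8.1-only code paths would be safe to enable here */
#endif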
--- Makefile.arm64 | 4 ++-- cpuid_arm64.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index a529fab80..cd16dbfae 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -30,8 +30,8 @@ FCOMMON_OPT += -march=armv8-a -mtune=thunderx endif ifeq ($(CORE), FALKOR) -CCOMMON_OPT += -march=armv8.1-a -mtune=falkor -FCOMMON_OPT += -march=armv8.1-a -mtune=falkor +CCOMMON_OPT += -march=armv8-a -mtune=falkor +FCOMMON_OPT += -march=armv8-a -mtune=falkor endif ifeq ($(CORE), THUNDERX2T99) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index c914fbc2b..5077d7b11 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -270,7 +270,7 @@ void get_cpuconfig(void) break; case CPU_THUNDERX2T99: - printf("#define VULCAN \n"); + printf("#define THUNDERX2T99 \n"); printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n"); From 6ba30e270d0a6988e02f45cd0b5ef2b505c5619c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 13:42:25 +0100 Subject: [PATCH 311/935] Fix typo that broke CNRM2 on ARMV8 since 0.3.0 must have happened in my #1449 --- kernel/arm64/KERNEL.ARMV8 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 5c70390dc..07d6cee99 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -93,8 +93,8 @@ IZAMAXKERNEL = izamax.S ifneq ($(OS_DARWIN)$(CROSS),11) SNRM2KERNEL = nrm2.S -CNRM2KERNEL = nrm2.S -DNRM2KERNEL = znrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S endif From 2fc712469d1e29220e2e3f3f83d2ab7b17c0bc60 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 13:56:06 +0100 Subject: [PATCH 312/935] Avoid creating spurious non-suffixed c/zgemm_kernels Plain cgemm_kernel and zgemm_kernel are not used anywhere, only cgemm_kernel_b etc. Needlessly building them (without any define like NN, CN, etc.) just happened to work on most platforms, but not on arm64. See #1870 --- kernel/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 947114ebe..2a330df4e 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -125,10 +125,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(USE_TRMM true) endif () - foreach (float_type ${FLOAT_TYPES}) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) + endforeach() + foreach (float_type ${FLOAT_TYPES}) + string(SUBSTRING ${float_type} 0 1 float_char) if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () From 7639f2e1f004d441757a43bcdfff6c32611a2aa3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 14:04:27 +0100 Subject: [PATCH 313/935] Rewrite the conditional for OSX to fix cmake parsing on others The Makefile variable parser in utils.cmake currently does not handle conditionals. Having the definitions for non-OSX last will at least make cmake builds work again on non-OSX platforms. 
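Taken together with the level3_thread.c serialization patches earlier in this series (281, 283 and 284), the locking scheme can be condensed as follows. This is a sketch only: the LEVEL3_LOCK/LEVEL3_UNLOCK wrapper names are invented here for clarity, the real code inlines the calls directly in gemm_driver, and (as patch 283 shows) it initializes the Windows CRITICAL_SECTION at each entry rather than once:

#ifndef USE_OPENMP
#ifndef OS_WINDOWS
#include <pthread.h>
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#define LEVEL3_LOCK()   pthread_mutex_lock(&level3_lock)
#define LEVEL3_UNLOCK() pthread_mutex_unlock(&level3_lock)
#else
#include <windows.h>
/* CRITICAL_SECTION has no static initializer, so it must be set up with
 * InitializeCriticalSection() before first use */
static CRITICAL_SECTION level3_lock;
#define LEVEL3_LOCK()   EnterCriticalSection(&level3_lock)
#define LEVEL3_UNLOCK() LeaveCriticalSection(&level3_lock)
#endif
#else
/* OpenMP builds rely on the OpenMP runtime for serialization */
#define LEVEL3_LOCK()
#define LEVEL3_UNLOCK()
#endif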
--- kernel/arm64/KERNEL.ARMV8 | 63 +++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 07d6cee99..a2a435738 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -104,8 +104,38 @@ CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S -ifneq ($(OS_DARWIN)$(CROSS),11) +ifeq ($(OS_DARWIN)$(CROSS),11) + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +else SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) @@ -173,35 +203,4 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -else - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - endif From 0b095166788b28dc9270edca2eb62ef2f201f6fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Dec 2018 18:33:05 +0100 Subject: [PATCH 314/935] Fix missing parameter in popen call --- cpuid_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_power.c b/cpuid_power.c index 23e98ebb0..82a3f4aac 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -136,7 +136,7 @@ int detect(void){ char buffer[512], *p; p = (char *)NULL; - infile = popen("prtconf|grep 'Processor Type'"); + infile = popen("prtconf|grep 'Processor 
Type'", "r"); while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("Pro", buffer, 3)){ p = strchr(buffer, ':') + 2; From 2b355592e34b07f4d0c5f81c275c902c0578236d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 16:25:55 +0100 Subject: [PATCH 315/935] Make sure to use the arm version of dynamic.c in ARM64 DYNAMIC_ARCH cf. #1908 --- driver/others/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index e20b14e79..f7cce4d46 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1) GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) if (DYNAMIC_ARCH) - list(APPEND COMMON_SOURCES dynamic.c) + if (ARM64) + list(APPEND COMMON_SOURcES dynamic_arm64.c) + else () + list(APPEND COMMON_SOURCES dynamic.c) + endif () else () list(APPEND COMMON_SOURCES parameter.c) endif () From 133c278ee565e91ff65d627b363aee36b71feeba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 17:42:23 +0100 Subject: [PATCH 316/935] Add DYNAMIC_CORE list for ARM64 cf #1908 --- cmake/arch.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 52fb64eaa..63fb86fa2 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,6 +44,10 @@ endif () if (DYNAMIC_ARCH) + if (ARM64) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99) + endif () + if (X86) set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) endif () From 0bf6d74e5f9855ddf2028dcc099ee58e4f13446b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Dec 2018 19:37:33 +0100 Subject: [PATCH 317/935] Fix typo in previous commit for arm dynamic arch --- driver/others/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index f7cce4d46..a07e00b3b 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -48,7 +48,7 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" if (DYNAMIC_ARCH) if (ARM64) - list(APPEND COMMON_SOURcES dynamic_arm64.c) + list(APPEND COMMON_SOURCES dynamic_arm64.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () From 38cc63859131921885b80ed5139304dc80c5a163 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Dec 2018 21:09:26 +0100 Subject: [PATCH 318/935] Avoid adding blanket march=skylake-avx512 to dynamic_arch builds --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f2647fb7d..dbee28079 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,6 +9,7 @@ endif endif ifeq ($(CORE), SKYLAKEX) +ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 @@ -22,6 +23,7 @@ endif endif endif endif +endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 From 06f7d78d70b95f936765312b8c8b3cadf7265ae5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Dec 2018 21:10:38 +0100 Subject: [PATCH 319/935] Add -march=skylake-avx512 to SkylakeX part of DYNAMIC_ARCH builds --- kernel/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index 923ffc363..6e178f80b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ 
-6,7 +6,11 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system ifdef TARGET_CORE +ifeq ($(TARGET_CORE), SKYLAKEX) +override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 +else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) +endif BUILD_KERNEL = 1 KDIR = TSUFFIX = _$(TARGET_CORE) From 51aec8e96b78f93f9a6dcbbf1edd212c5f1ab2ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Dec 2018 22:47:32 +0100 Subject: [PATCH 320/935] make sure the added march=skylake-avx512 does not cause problems on Windows --- kernel/Makefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 6e178f80b..a441bde7c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,15 @@ include $(TOPDIR)/Makefile.system ifdef TARGET_CORE ifeq ($(TARGET_CORE), SKYLAKEX) -override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From cdc668d82b7afd6a2ddee33987ecfebcaccebc2d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 12 Dec 2018 16:45:57 +0000 Subject: [PATCH 321/935] Add a "sgemm direct" mode for small matrices OpenBLAS has a fancy algorithm for copying the input data while laying it out in a more CPU-friendly memory layout. This is great for large matrices; the cost of the copy is easily amortized by the gains from the better memory layout. But for small matrices (on CPUs that can do efficient unaligned loads) this copy can be a net loss. This patch adds (for SKYLAKEX initially) a "sgemm direct" mode, that bypasses the whole copy machinery for ALPHA=1/BETA=0/... standard arguments, for small matrices only. What is small? 
For the non-threaded case this has been measured to be in the M*N*K = 28 * 512 * 512 range, while in the threaded case it's less, around M*N*K = 1 * 512 * 512 --- common_level3.h | 8 + interface/gemm.c | 8 + kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 467 ++++++++++++++++++++- param.h | 1 + 4 files changed, 483 insertions(+), 1 deletion(-) diff --git a/common_level3.h b/common_level3.h index 1f5490baa..6fa902be8 100644 --- a/common_level3.h +++ b/common_level3.h @@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); extern "C" { #endif +extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, + float * A, BLASLONG strideA, + float * B, BLASLONG strideB, + float * R, BLASLONG strideR); + +extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); + + int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, diff --git a/interface/gemm.c b/interface/gemm.c index a3bac5984..97e71bc85 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -271,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; +#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { + sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); + return; + } + +#endif + #ifndef COMPLEX args.alpha = (void *)&alpha; args.beta = (void *)&beta; diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 10d3d22ed..3246e681f 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -760,7 +760,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************************************/ int __attribute__ ((noinline)) -CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG ldc) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc) { unsigned long M = m, N = n, K = k; if (M == 0) @@ -1175,3 +1175,468 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; } + + +/* + * "Direct sgemm" code. This code operates directly on the inputs and outputs + * of the sgemm call, avoiding the copies, memory realignments and threading, + * and only supports alpha = 1 and beta = 0. + * This is a common case and provides value for relatively small matrixes. + * For larger matrixes the "regular" sgemm code is superior, there the cost of + * copying/shuffling the B matrix really pays off.
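+ *
+ * A hypothetical caller that would hit this path (the dimensions here are
+ * only illustrative; what matters is alpha = 1, beta = 0, row-major order,
+ * no transposition and a small M*N*K):
+ *   cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+ *               64, 64, 64, 1.0f, A, 64, B, 64, 0.0f, C, 64);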
+ */ + + + +#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps() +#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)]) +#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M) + + +#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps() +#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)]) +#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M) + +#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps() +#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) +#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)]) +#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M) +#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M) + +#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0; +#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)]; +#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + N]; +#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; +#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; + +int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) +{ + int mnk = M * N * K; + /* large matrixes -> not performant */ + if (mnk >= 28 * 512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + + + +void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +{ + int i, j, k; + + int m4 = M & ~3; + int m2 = M & ~1; + + int n64 = N & ~63; + int n32 = N & ~31; + int n16 = N & ~15; + int n8 = N & ~7; + int n4 = N & ~3; + int n2 = N & ~1; + + i = 0; + + for (i = 0; i < m4; i+=4) { + + for (j = 0; j < n64; j+= 64) { + k = 0; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 
2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + DECLARE_RESULT_256(0, 2); + DECLARE_RESULT_256(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + BROADCAST_LOAD_A_256(x, 2); + BROADCAST_LOAD_A_256(x, 3); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + MATMUL_256(0, 2); + MATMUL_256(0, 3); + } + STORE_256(0, 0); + STORE_256(0, 1); + STORE_256(0, 2); + STORE_256(0, 3); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + DECLARE_RESULT_128(0, 2); + DECLARE_RESULT_128(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + BROADCAST_LOAD_A_128(x, 2); + BROADCAST_LOAD_A_128(x, 3); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + MATMUL_128(0, 2); + MATMUL_128(0, 3); + } + STORE_128(0, 0); + STORE_128(0, 1); + STORE_128(0, 2); + STORE_128(0, 3); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2); + DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + BROADCAST_LOAD_A_SCALAR(x, 2); + BROADCAST_LOAD_A_SCALAR(x, 3); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2); + MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + STORE_SCALAR(0, 2); STORE_SCALAR(1, 2); + STORE_SCALAR(0, 3); STORE_SCALAR(1, 3); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0) + DECLARE_RESULT_SCALAR(0, 1) + DECLARE_RESULT_SCALAR(0, 2) + 
DECLARE_RESULT_SCALAR(0, 3) + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + BROADCAST_LOAD_A_SCALAR(0, 2); + BROADCAST_LOAD_A_SCALAR(0, 3); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + MATMUL_SCALAR(0, 2); + MATMUL_SCALAR(0, 3); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + STORE_SCALAR(0, 2); + STORE_SCALAR(0, 3); + } + } + + for (; i < m2; i+=2) { + j = 0; + + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + } + STORE_256(0, 0); + STORE_256(0, 1); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + } + STORE_128(0, 0); + STORE_128(0, 1); + } + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + DECLARE_RESULT_SCALAR(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + } + } + + for (; i < M; i+=1) { + j = 0; + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + MATMUL_512(0, 0); MATMUL_512(1, 
0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + LOAD_B_256(0, x); + MATMUL_256(0, 0); + } + STORE_256(0, 0); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + LOAD_B_128(0, x); + MATMUL_128(0, 0); + } + STORE_128(0, 0); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + LOAD_B_SCALAR(0, 0); LOAD_B_SCALAR(1, 0); + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + LOAD_B_SCALAR(0, 0); + MATMUL_SCALAR(0, 0); + } + STORE_SCALAR(0, 0); + } + } +} \ No newline at end of file diff --git a/param.h b/param.h index 8f56cdaaa..7a18d82d7 100644 --- a/param.h +++ b/param.h @@ -1628,6 +1628,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SWITCH_RATIO 32 #define GEMM_PREFERED_SIZE 32 +#define USE_SGEMM_KERNEL_DIRECT 1 #ifdef ARCH_X86 From 00dc09ad198aedec53fd05ea1b13d72d7a9a517a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 15 Dec 2018 13:18:59 +0000 Subject: [PATCH 322/935] Use the skylake sgemm beta code also for haswell with a few small changes it's possible to use the skylake sgemm code also for haswell, this gives a modest gain (10% range) for smallish matrixes but does wonders for very skinny matrixes --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/sgemm_beta_skylakex.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 848de38df..2aec60064 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -33,6 +33,7 @@ ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S +SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 498c46f0d..e8653112c 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -61,11 +61,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ - __m512 z_zero; - __m256 y_zero; +#ifdef __AVX512CD__ + __m512 z_zero = _mm512_setzero_ps(); +#endif + __m256 y_zero = _mm256_setzero_ps(); - z_zero = _mm512_setzero_ps(); - y_zero = _mm256_setzero_ps(); j = n; do { c_offset1 = c_offset; @@ -74,8 +74,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; while (i >= 32) { +#ifdef __AVX512CD__ _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); +#else + _mm256_storeu_ps(c_offset1, 
y_zero); + _mm256_storeu_ps(c_offset1 + 8, y_zero); + _mm256_storeu_ps(c_offset1 + 16, y_zero); + _mm256_storeu_ps(c_offset1 + 24, y_zero); +#endif c_offset1 += 32; i -= 32; } From 0586899a10b97bf1baf50e4988d18b4268317420 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 15 Dec 2018 13:43:07 +0000 Subject: [PATCH 323/935] Use sgemm_ncopy_4_skylakex.c also for Haswell sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the real perf win happens; this also works great for Haswell. This gives double digit percentage gains on small and skinny matrices --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/sgemm_ncopy_4_skylakex.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 2aec60064..422e6c315 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -36,7 +36,7 @@ SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sgemm_ncopy_4_skylakex.c b/kernel/x86_64/sgemm_ncopy_4_skylakex.c index 8577e3b38..6b2b0f5b1 100644 --- a/kernel/x86_64/sgemm_ncopy_4_skylakex.c +++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c @@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ FLOAT *b_offset; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp9, ctemp13; a_offset = a; b_offset = b; From 1ebe5c0f499575d42e85b4f89e4205882be8ebe3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 19:35:35 +0100 Subject: [PATCH 324/935] Add -march=haswell to HASWELL part of DYNAMIC_ARCH build --- kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index a441bde7c..d86411d91 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,6 +16,8 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif +elseifeq($(TARGET_CORE), HASWELL) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=haswell else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 2a3190dc76a3eb60fabe298b1df04c46cdca5350 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 20:17:44 +0100 Subject: [PATCH 325/935] fix elseifeq and use older option core2-avx for compatibility --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index d86411d91..169c7f79c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,8 +16,8 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif -elseifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=haswell +else ifeq($(TARGET_CORE), HASWELL) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core2-avx else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From fbcb14a74bb252ea344f5b10d3d741268326906f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 20:18:59 +0100 Subject: [PATCH 
326/935] should be core-avx2 --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 169c7f79c..a9208619f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,7 +17,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core2-avx + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core-avx2 else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 3843e3e01781970690325542fe15a722f87407c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Dec 2018 23:30:31 +0100 Subject: [PATCH 327/935] use -mavx2 on haswell --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index a9208619f..b01893175 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,7 +17,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=core-avx2 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -mavx2 else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 69d206440ab669794201d65d4e8087060e519474 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 00:19:41 +0000 Subject: [PATCH 328/935] Make the skylakex/haswell sgemm code compile and run even with compilers without avx2 support --- kernel/x86_64/sgemm_beta_skylakex.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index e8653112c..cdc9c44be 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -61,10 +61,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ -#ifdef __AVX512CD__ - __m512 z_zero = _mm512_setzero_ps(); -#endif - __m256 y_zero = _mm256_setzero_ps(); j = n; do { @@ -72,12 +68,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset += ldc; i = m; - +#ifdef __AVX2__ while (i >= 32) { #ifdef __AVX512CD__ + __m512 z_zero = _mm512_setzero_ps(); _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); #else + __m256 y_zero = _mm256_setzero_ps(); _mm256_storeu_ps(c_offset1, y_zero); _mm256_storeu_ps(c_offset1 + 8, y_zero); _mm256_storeu_ps(c_offset1 + 16, y_zero); @@ -87,11 +85,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i -= 32; } while (i >= 8) { + __m256 y_zero = _mm256_setzero_ps(); _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8; } - +#endif while (i > 0) { *c_offset1 = ZERO; c_offset1 ++; From 545c2b1bbbbe9a1c548150189e54fc76e62e4b13 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 13:09:19 +0100 Subject: [PATCH 329/935] Add -mavx2 on Haswell only if the compiler supports it --- kernel/Makefile | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index b01893175..17bfd4063 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,6 +5,27 @@ endif TOPDIR = ..
include $(TOPDIR)/Makefile.system +AVX2OPT = +ifeq ($(C_COMPILER), GCC) +# AVX2 support was added in 4.7.0 + GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) + ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) + AVX2OPT = -mavx2 + endif +endif +ifeq ($(C_COMPILER), CLANG) +# Any clang posing as gcc 4.2 should be new enough (3.4 or later) + GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) + GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) + ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) + AVX2OPT -mavx2 + endif +endif +ifdef NO_AVX2 + AVX2OPT= +endif + ifdef TARGET_CORE ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 @@ -17,9 +38,9 @@ ifeq ($(TARGET_CORE), SKYLAKEX) endif endif else ifeq($(TARGET_CORE), HASWELL) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -mavx2 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else -override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif BUILD_KERNEL = 1 KDIR = From cfc4acc221344d53d72550d157c5050ddaa26ed7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 16:19:51 +0100 Subject: [PATCH 330/935] typo --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 17bfd4063..30292cd80 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -19,7 +19,7 @@ ifeq ($(C_COMPILER), CLANG) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) - AVX2OPT -mavx2 + AVX2OPT = -mavx2 endif endif ifdef NO_AVX2 From c4e23dd016ed2852ebf59a0d744deb55a48e66c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Dec 2018 18:14:40 +0100 Subject: [PATCH 331/935] Update Makefile --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 30292cd80..e81225075 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -37,7 +37,7 @@ ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -fno-asynchronous-unwind-tables endif endif -else ifeq($(TARGET_CORE), HASWELL) +else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From c43331ad0aeaefe4b4d90aab06c93655c851feab Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 22:59:02 +0000 Subject: [PATCH 332/935] dgemm: Use the skylakex beta function also for haswell it's more efficient for certain tall/skinny matrices --- kernel/x86_64/KERNEL.HASWELL | 1 + kernel/x86_64/dgemm_beta_skylakex.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 422e6c315..4cd67a705 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -45,6 +45,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c DGEMMKERNEL = dgemm_kernel_4x8_haswell.S +DGEMM_BETA = dgemm_beta_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c 
DGEMMONCOPY = ../generic/gemm_ncopy_8.c diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 6a824c9b5..8c24725a1 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -61,17 +61,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset = c; if (beta == ZERO){ - __m512d z_zero; - z_zero = _mm512_setzero_pd(); j = n; do { c_offset1 = c_offset; c_offset += ldc; i = m; - +#ifdef __AVX2__ +#ifdef __AVX512CD__ while (i >= 32) { + __m512d z_zero = _mm512_setzero_pd(); _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -79,12 +79,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } +#endif while (i >= 8) { +#ifdef __AVX512CD__ + __m512d z_zero = _mm512_setzero_pd(); _mm512_storeu_pd(c_offset1, z_zero); +#else + __m256d y_zero = _mm256_setzero_pd(); + _mm256_storeu_pd(c_offset1, y_zero); + _mm256_storeu_pd(c_offset1 + 4, y_zero); +#endif c_offset1 += 8; i -= 8; } - +#endif while (i > 0) { *c_offset1 = ZERO; c_offset1 ++; From d321448a63954d536f90592cd0cc53c304b08d2e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 23:06:58 +0000 Subject: [PATCH 333/935] dgemm: use dgemm_ncopy_8_skylakex.c also for Haswell The dgemm_ncopy_8_skylakex.c code is not avx512 specific and gives a nice performance boost for medium sized matrices --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 4cd67a705..f98728a41 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -48,7 +48,7 @@ DGEMMKERNEL = dgemm_kernel_4x8_haswell.S DGEMM_BETA = dgemm_beta_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) From b28f75cd7e61cf5bdcf404ebece07f75553ecde0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 16 Dec 2018 23:08:31 +0000 Subject: [PATCH 334/935] set GEMM_PREFERED_SIZE for HASWELL Haswell likes a GEMM_PREFERED_SIZE of 16 to improve the split that the threading code does to make it a nice multiple of the SIMD kernel size --- param.h | 1 + 1 file changed, 1 insertion(+) diff --git a/param.h b/param.h index 7a18d82d7..fa6730208 100644 --- a/param.h +++ b/param.h @@ -1508,6 +1508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 16 #ifdef ARCH_X86 From f343ed65b59b04d9757bf10fcc9fec938d9895a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Dec 2018 22:30:29 +0100 Subject: [PATCH 335/935] Avoid taking the root of a negative number Fixes #1924 where numpy 1.17+ would report the (transient) FE_INVALID exception raised for the domain error. 
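The essence of the fix: dnum is derived from (nt * nt - nf * nf) / nthreads (visible as context in the second hunk below), so the radicand di * di + dnum can legitimately turn negative. Condensed into a hypothetical standalone helper (names follow the first hunk; the BLASLONG typedef here is only an illustrative stand-in), the guard amounts to:

#include <math.h>

typedef long BLASLONG; /* illustrative stand-in for the real typedef */

static BLASLONG guarded_width(double di, double dnum, int mask)
{
  double dinum = di * di + dnum;
  if (dinum < 0) /* sqrt() of a negative radicand raises FE_INVALID */
    return (BLASLONG)((-di + mask) / (mask + 1)) * (mask + 1);
  return (BLASLONG)((sqrt(dinum) - di + mask) / (mask + 1)) * (mask + 1);
}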
--- driver/level3/syrk_thread.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 5f40853dc..b26d363c4 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( BLASLONG width, i; BLASLONG n_from, n_to; - double dnum, nf, nt, di; + double dnum, nf, nt, di, dinum; int num_cpu; int mask = 0; @@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (nthreads - num_cpu > 1) { di = (double)i; - width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); + dinum = di * di +dnum; + if (dinum <0) + width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1); + else + width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; @@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( nf = (double)(arg -> n - n_from); nt = (double)(arg -> n - n_to); - dnum = (nt * nt - nf * nf) / (double)nthreads; - num_cpu = 0; range[0] = n_from; @@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (nthreads - num_cpu > 1) { di = (double)(arg -> n - i); - width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); - + dinum = di * di + dnum; + if (dinum<0) + width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1); + else + width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; } else { From 26a3402773050c8fb3c0e633e967fc1a6456fe0b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 12:26:01 +0100 Subject: [PATCH 336/935] Reflect ARMV8 target definition changes from PR1876 and create config target directory for cross-compiles. 
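For reference, the fragment this writes into the generated config header for a CORTEXA57 cross-build comes out roughly as follows (abridged; the values are the ones set in the diff below):

#define CORTEXA57
#define CHAR_CORENAME "CORTEXA57"
#define L1_DATA_SIZE 32768
#define L1_DATA_LINESIZE 64
#define L2_SIZE 262144
#define DTB_DEFAULT_ENTRIES 64
#define HAVE_NEON
#define ARMV8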
--- cmake/prebuild.cmake | 114 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 4 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index f29bc3a75..6ed99e807 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -116,10 +116,37 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define L2_LINESIZE\t64\n" "#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_SIZE\t4096\n" - "#define L2_ASSOCIATIVE\t32\n") + "#define L2_ASSOCIATIVE\t32\n" + "#define ARMV8\n") set(SGEMM_UNROLL_M 4) set(SGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA57") + elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t32768\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t3\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t262144\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t16\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 4) + elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t49152\n" "#define L1_CODE_LINESIZE\t64\n" @@ -127,7 +154,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_LINESIZE\t64\n" "#define L1_DATA_ASSOCIATIVE\t2\n" - "#define L2_SIZE\t2097152\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t16\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 4) + elseif ("${CORE}" STREQUAL "FALKOR") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t3\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t128\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t524288\n" "#define L2_LINESIZE\t64\n" "#define L2_ASSOCIATIVE\t16\n" "#define DTB_DEFAULT_ENTRIES\t64\n" @@ -135,7 +188,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define HAVE_VFPV4\n" "#define HAVE_VFPV3\n" "#define HAVE_VFP\n" - "#define HAVE_NEON\n") + "#define HAVE_NEON\n" + "#define ARMV8\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 8) @@ -144,6 +198,57 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 4) + elseif ("${CORE}" STREQUAL "THUNDERX) + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t32768\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t3\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t128\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t167772164\n" + "#define L2_LINESIZE\t128\n" + "#define 
L2_ASSOCIATIVE\t16\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + elseif ("${CORE}" STREQUAL "THUNDERX2T99) + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t32768\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t262144\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t33554432\n" + "#define L3_LINESIZE\t64\n" + "#define L3_ASSOCIATIVE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define VULCAN\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) endif() # Or should this actually be NUM_CORES? @@ -163,6 +268,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS file(APPEND ${TARGET_CONF_TEMP} "#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n") # Move to where gen_config_h would place it + file(MAKE_DIRECTORY ${TARGET_CONF_DIR}) file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}") else(NOT CMAKE_CROSSCOMPILING) From 43c2b0eb5594bbcb0c48882965a6d655b0f99bc5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 17:16:43 +0100 Subject: [PATCH 337/935] Add -mavx2 to TARGET=HASWELL builds to leverage improvements from PR#1921 --- Makefile.x86_64 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index dbee28079..1b7fe3ef4 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -25,6 +25,17 @@ endif endif endif +ifeq ($(CORE), HASWELL) +ifndef DYNAMIC_ARCH +ifndef NO_AVX2 +CCOMMON_OPT += -mavx2 +FCOMMON_OPT += -mavx2 +endif +endif +endif + + + ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 endif From 49e0f485dac263e3b26cff01ed1759e46880e497 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 17:26:09 +0100 Subject: [PATCH 338/935] Add -mavx2 for TARGET=HASWELL if compiler supports and requires it --- cmake/system.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index d803bb9eb..ba2c4f351 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -45,6 +45,12 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() +if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() +endif() endif() if (DEFINED TARGET) From 76b4b8980f7cec3ad0dde05d3c0ef2f395d04622 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 19:08:19 +0100 Subject: [PATCH 339/935] Use -dumpversion with gcc only --- cmake/system.cmake | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index ba2c4f351..a060d98cb 100644 --- 
a/cmake/system.cmake +++ b/cmake/system.cmake @@ -42,15 +42,19 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () if (DEFINED TARGET) -if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") -endif() -if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") - endif() -endif() + if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() endif() if (DEFINED TARGET) From 5bd21ab6e1e4da023185c1472877d9806b1d0c48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 23:46:48 +0100 Subject: [PATCH 340/935] Make sure that -fPIC is present when needed override user-provided FFLAGS if necessary --- Makefile.system | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 3987460ec..fb8e7ea41 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1154,8 +1154,6 @@ ifndef FCOMMON_OPT FCOMMON_OPT = -O2 -frecursive endif - - override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) @@ -1163,6 +1161,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = +ifdef NEED_PIC +ifeq (,$(findstring PIC,$(FFLAGS))) +override FFLAGS += -fPIC +endif +endif + #For LAPACK Fortran codes. #Disable -fopenmp for LAPACK Fortran codes on Windows. ifdef OS_WINDOWS From d6818777d1ed7ead02c0d0b448b2d60e783c97f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Dec 2018 23:47:37 +0100 Subject: [PATCH 341/935] Make sure that -fPIC is present if needed --- exports/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 3a5f77db3..5628eacac 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -2,6 +2,12 @@ TOPDIR = .. 
include ../Makefile.system +ifdef NEED_PIC +ifeq (,$(findstring PIC,$(CFLAGS))) +CFLAGS+= -fPIC +endif +endif + ifndef EXPRECISION EXPRECISION = 0 endif From 795285c587d40c004910ad8cde72abacfe8f5e2a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 24 Dec 2018 18:49:50 +0000 Subject: [PATCH 342/935] Fix thinko in skylake beta handling casting the float to an integer is cheaper, but it performs a rounding value conversion rather than reinterpreting the bits in memory, so non-integral beta values compared equal to zero, resulting in invalid outcomes --- kernel/x86_64/dgemm_beta_skylakex.c | 2 +- kernel/x86_64/sgemm_beta_skylakex.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 8c24725a1..5cd001920 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, FLOAT ctemp5, ctemp6, ctemp7, ctemp8; /* fast path.. just zero the whole matrix */ - if (m == ldc && (unsigned long)beta == (unsigned long)ZERO) { + if (m == ldc && beta == ZERO) { memset(c, 0, m * n * sizeof(FLOAT)); return 0; } diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index cdc9c44be..1c29c1168 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, FLOAT ctemp5, ctemp6, ctemp7, ctemp8; /* fast path.. just zero the whole matrix */ - if (m == ldc && (unsigned long)beta == (unsigned long)ZERO) { + if (m == ldc && beta == ZERO) { memset(c, 0, m * n * sizeof(FLOAT)); return 0; } From fe02ba86a46699f5bba3a403bbb1e513273bdd53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 24 Dec 2018 20:46:04 +0100 Subject: [PATCH 343/935] Remove unnecessary change again --- exports/Makefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 5628eacac..3a5f77db3 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -2,12 +2,6 @@ TOPDIR = .. include ../Makefile.system -ifdef NEED_PIC -ifeq (,$(findstring PIC,$(CFLAGS))) -CFLAGS+= -fPIC -endif -endif - ifndef EXPRECISION EXPRECISION = 0 endif From 211120c50832f8f338872c891a51b86e291f13b9 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Thu, 27 Dec 2018 23:09:21 +0100 Subject: [PATCH 344/935] Fix typo in UNKNOWN core name Should be of no consequence, right?
--- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 8e4a7cb84..eb986b6b6 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1649,7 +1649,7 @@ static char *lowercpuname[] = { }; static char *corename[] = { - "UNKOWN", + "UNKNOWN", "80486", "P5", "P6", From 09170268a31a2113c1203e44da54f3129ca572cf Mon Sep 17 00:00:00 2001 From: TiborGY Date: Fri, 28 Dec 2018 14:33:18 +0100 Subject: [PATCH 345/935] Update cpuid_arm.c --- cpuid_arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_arm.c b/cpuid_arm.c index 2f8959242..19aa90718 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -34,7 +34,7 @@ #define CPU_CORTEXA15 4 static char *cpuname[] = { - "UNKOWN", + "UNKNOWN", "ARMV6", "ARMV7", "CORTEXA9", From 187233953cadbb876477e511c38e6ac95f44feed Mon Sep 17 00:00:00 2001 From: TiborGY Date: Fri, 28 Dec 2018 14:34:38 +0100 Subject: [PATCH 346/935] Update cpuid_mips.c --- cpuid_mips.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index c09902936..6f2932c94 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -75,7 +75,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_1004K 2 static char *cpuname[] = { - "UNKOWN", + "UNKNOWN", "P5600", "1004K" }; From c329de2931fd524be15aba7c7f04336758552459 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Fri, 28 Dec 2018 14:35:41 +0100 Subject: [PATCH 347/935] Update Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d42f9b8c3..21096f893 100644 --- a/Makefile +++ b/Makefile @@ -131,7 +131,7 @@ endif endif libs : -ifeq ($(CORE), UNKOWN) +ifeq ($(CORE), UNKNOWN) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) endif ifeq ($(NOFORTRAN), 1) From 7cbc2c37d64665d221e6db7537354a09809ff2f3 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Fri, 28 Dec 2018 14:36:39 +0100 Subject: [PATCH 348/935] Update cpuid_mips64.c --- cpuid_mips64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_mips64.c b/cpuid_mips64.c index dcb559a7c..0e32bfc0b 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CPU_I6500 6 static char *cpuname[] = { - "UNKOWN", + "UNKNOWN", "SICORTEX", "LOONGSON3A", "LOONGSON3B", From 93240f489eaf6352f07366c79e62168583f74b98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 Dec 2018 18:12:54 +0100 Subject: [PATCH 349/935] Fix wrong case in TARGET setting for Alpine --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4efa23b8d..3f323a854 100644 --- a/.travis.yml +++ b/.travis.yml @@ -117,7 +117,7 @@ matrix: - <<: *test-alpine env: - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" + - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - &test-cmake os: linux From bba1e672691cd62a2a0607865a2514334f8700e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 Dec 2018 21:59:31 +0100 Subject: [PATCH 350/935] Delete the pthread key on cleanup in TLS mode to avoid a crash when OpenBLAS was loaded via dlopen and libc tries to clean up the leaked TLS after dlclose Fixes #1720 --- driver/others/memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 36815a39c..6f7a7db82 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1073,6 +1073,11 @@ static volatile int memory_initialized = 0; } free(table); } +#if defined(OS_WINDOWS) + TlsFree(local_storage_key); +#else + pthread_key_delete(local_storage_key); +#endif } static void blas_memory_init(){ From 9f80e0f5fcfe883b5f355d71831bc22880c40271 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Dec 2018 14:39:18 +0100 Subject: [PATCH 351/935] Remove stray include of complex.h already provided conditionally by common.h via openblas_utest.h Unconditional inclusion breaks older Android and similar platforms that use OPENBLAS_COMPLEX_STRUCT --- utest/test_dotu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/utest/test_dotu.c b/utest/test_dotu.c index ef04dd9a8..918541848 100644 --- a/utest/test_dotu.c +++ b/utest/test_dotu.c @@ -32,7 +32,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" -#include CTEST( zdotu,zdotu_n_1) { From 5a720cf9cac5266079c06032fb2ab36da4ed84f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Dec 2018 15:22:37 +0100 Subject: [PATCH 352/935] Re-enable loop unrolling in trmv and remove the scary warning fixes #1748 as that half of the fix for #1332 appears to have been an overreaction on my part. 
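The blocking that returns here is a data-TLB optimization: DTB_ENTRIES is sized from the target's data-TLB parameters (DTB_DEFAULT_ENTRIES in the generated config), so walking the matrix in panels of at most DTB_ENTRIES columns keeps each panel's page translations resident while the unrolled GEMV update runs over it. A minimal sketch of the restored loop shape (names as in the diff below; the panel body is elided):

for (is = 0; is < m; is += DTB_ENTRIES) {
  min_i = MIN(m - is, DTB_ENTRIES);
  /* GEMV_N update involving the rows above the current panel,
     then the triangular part of the panel itself */
}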
--- driver/level2/trmv_U.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/driver/level2/trmv_U.c b/driver/level2/trmv_U.c index 7f8895e7f..90ffb7370 100644 --- a/driver/level2/trmv_U.c +++ b/driver/level2/trmv_U.c @@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu COPY_K(m, b, incb, buffer, 1); } -/*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */ -/* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now[B */ + for (is = 0; is < m; is += DTB_ENTRIES){ - for (is = 0; is < m; is += DTB_ENTRIES * 100){ - - min_i = MIN(m - is, DTB_ENTRIES * 100); + min_i = MIN(m - is, DTB_ENTRIES); #ifndef TRANSA - if (is > 0){ -fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n"); + if (is > 0){ GEMV_N(is, min_i, 0, dp1, a + is * lda, lda, B + is, 1, From 0d52aefc6b462db2fcdb9ff800d11b7ba8a4f7ab Mon Sep 17 00:00:00 2001 From: George Hartzell Date: Sun, 30 Dec 2018 14:55:34 -0800 Subject: [PATCH 353/935] Typo: Skyalke -> Skylake Worth fixing, it gets in the way of searching.... --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9ed9be337..26055c745 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. -* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels. +* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build the library with `BIGNUMA=1`. From 13d006339b2082ec871b839b73349a2f4645bf83 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:00:46 +0100 Subject: [PATCH 354/935] Update ChangeLog.txt with changes from 0.3.5 --- Changelog.txt | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 0dd17a558..49b26873a 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,36 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.5 +31-Dec-2018 + +common: + * loop unrolling in TRMV has been enabled again. + * A domain error in the thread workload distribution for SYRK + has been fixed. + * gmake builds will now automatically add -fPIC to the build + options if the platform requires it. + * a pthreads key leakage (and associate crash on dlclose) in + the USE_TLS codepath was fixed. + * building of the utest cases on systems that do not provide + an implementation of complex.h was fixed. + +x86_64: + * the SkylakeX code was changed to compile on OSX. + * unwanted application of the -march=skylake-avx512 option + to the common code parts of a DYNAMIC_ARCH build was fixed. + * improved performance of SGEMM for small workloads on Skylake X. + * performance of SGEMM and DGEMM was improved on Haswell. + +ARMV8: + * a configuration error that broke the CNRM2 kernel was corrected. + * compilation of the GEMM kernels with CMAKE was fixed. + * DYNAMIC_ARCH builds are now available with CMAKE as well. 
+ * using CMAKE for cross-compilation to the new cpu TARGETs + introduced in 0.3.4 now works. + +POWER: + * a problem in cpu autodetection for AIX has been corrected. + ==================================================================== Version 0.3.4 02-Dec-2018 From 9185d419d3c0452a898eb44618d47c11c9cd450e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:09:20 +0100 Subject: [PATCH 355/935] Version 0.3.5 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 24c169afe..ac5dd93de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 5.dev) +set(OpenBLAS_PATCH_VERSION 5) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From eebc18928715775c9ed254684edee16e4efe0342 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:09:59 +0100 Subject: [PATCH 356/935] Version 0.3.5 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0d5b83b39..3033455d3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.5.dev +VERSION = 0.3.5 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 2940798ea7efb799d682739e3e5d00985b3efd3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:10:59 +0100 Subject: [PATCH 357/935] Increment version to 0.3.6.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 24c169afe..812e6bf6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 5.dev) +set(OpenBLAS_PATCH_VERSION 6.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From ed704185abd09fe04c6c82cf809c1cb09d359651 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:11:37 +0100 Subject: [PATCH 358/935] Increment version to 0.3.6.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0d5b83b39..7c128fb49 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.5.dev +VERSION = 0.3.6.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From d11554c88fdf1b6a9cad1c4c1252f27995117378 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Mon, 31 Dec 2018 23:19:44 +0100 Subject: [PATCH 359/935] Validate user supplied TARGET (#1941) the build will now abort with an error message when an undefined build TARGET is named Fixes #1938 --- Makefile.system | 1 + getarch.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/Makefile.system b/Makefile.system index fb8e7ea41..20d4f6492 100644 --- a/Makefile.system +++ b/Makefile.system @@ -65,6 +65,7 @@ endif ifdef TARGET GETARCH_FLAGS := -DFORCE_$(TARGET) +GETARCH_FLAGS += -DUSER_TARGET endif # Force fallbacks for 32bit diff --git a/getarch.c b/getarch.c index 146f1f36f..78ba0fefd 100644 --- a/getarch.c +++ b/getarch.c @@ -1068,6 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef FORCE +#ifdef USER_TARGET +#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt" +#endif + #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) #ifndef POWER From 20d1aad13f59d6146bcdf8be6716cd8cc020d2bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Jan 2019 20:15:35 +0100 Subject: [PATCH 360/935] Fix missing quotes around thunderx targets --- cmake/prebuild.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6ed99e807..757461008 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -198,7 +198,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "THUNDERX) + elseif ("${CORE}" STREQUAL "THUNDERX") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -224,7 +224,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 2) set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 2) - elseif ("${CORE}" STREQUAL "THUNDERX2T99) + elseif ("${CORE}" STREQUAL "THUNDERX2T99") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" From 802f0dbde153b166f533ab1660336d7832e5b616 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Jan 2019 22:17:31 +0100 Subject: [PATCH 361/935] More fixes for cross-compiling ARM64 targets Fixed core naming for DYNAMIC_ARCH. Corrected GEMM_DEFAULT entries and added SYMV_P. Replaced outdated VULCAN define for ThunderX2T99 with ARMV8 to get basic definitions back. 
For issue #1908 --- cmake/prebuild.cmake | 45 ++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 757461008..a67c44bf5 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -87,13 +87,18 @@ endif () # Cannot run getarch on target if we are cross-compiling if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) # Write to config as getarch would + if (DEFINED TARGET_CORE) + set(TCORE ${TARGET_CORE}) + else() + set(TCORE ${CORE}) + endif() # TODO: Set up defines that getarch sets up based on every other target # Perhaps this should be inside a different file as it grows larger file(APPEND ${TARGET_CONF_TEMP} - "#define ${CORE}\n" - "#define CHAR_CORENAME \"${CORE}\"\n") - if ("${CORE}" STREQUAL "ARMV7") + "#define ${TCORE}\n" + "#define CHAR_CORENAME \"${TCORE}\"\n") + if ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_LINESIZE\t32\n" @@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "ARMV8") + elseif ("${TCORE}" STREQUAL "ARMV8") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_LINESIZE\t64\n" @@ -118,9 +123,16 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DTB_SIZE\t4096\n" "#define L2_ASSOCIATIVE\t32\n" "#define ARMV8\n") - set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -144,9 +156,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t49152\n" "#define L1_CODE_LINESIZE\t64\n" @@ -170,9 +183,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "FALKOR") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "FALKOR") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" @@ -196,9 +210,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "THUNDERX") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -224,7 +239,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 2) 
set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 2) - elseif ("${CORE}" STREQUAL "THUNDERX2T99") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX2T99") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -240,7 +256,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define L3_ASSOCIATIVE\t32\n" "#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_SIZE\t4096\n" - "#define VULCAN\n") + "#define ARMV8\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 8) @@ -249,6 +265,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) endif() # Or should this actually be NUM_CORES? From 1aa840a0a2e52edfe4572e99131c4f19ccc63e58 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 4 Jan 2019 01:38:18 +0200 Subject: [PATCH 362/935] [ZARCH] fix sgemv_t_4.c --- kernel/zarch/sgemv_t_4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index efc06297f..fe99ef5ce 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -158,8 +158,6 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" "vrepf %%v4,%%v0,1 \n\t" "aebr %%f0,%%f4 \n\t" "vrepf %%v4,%%v0,2 \n\t" @@ -351,6 +349,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "vl %%v31,112(%%r1,%1) \n\t" "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" "1: \n\t" "lghi %%r0,28 \n\t" From 94cd946b963e9e077cb4a4c5d93b1ce691e1fe63 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 4 Jan 2019 17:45:56 +0200 Subject: [PATCH 363/935] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 332 +++++++++++++++++++-------------------- 1 file changed, 166 insertions(+), 166 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 4c3253774..c939aea9f 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -34,107 +34,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%5) \n\t" - "vlrepg %%v17,8(%5) \n\t" - "vlrepg %%v18,16(%5) \n\t" - "vlrepg %%v19,24(%5) \n\t" + "vlrepg %%v16,0(%5) \n\t" + "vlrepg %%v17,8(%5) \n\t" + "vlrepg %%v18,16(%5) \n\t" + "vlrepg %%v19,24(%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" #else - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlef %%v24,0(%%r1,%1),0 \n\t" - "vlef %%v24,0(%%r1,%1),1 \n\t" - "vlef %%v24,8(%%r1,%1),2 \n\t" - "vlef %%v24,8(%%r1,%1),3 \n\t" - "vlef %%v25,4(%%r1,%1),0 \n\t" - "vlef %%v25,4(%%r1,%1),1 \n\t" - "vlef %%v25,12(%%r1,%1),2 \n\t" - "vlef %%v25,12(%%r1,%1),3 \n\t" - "vlef %%v26,0(%%r1,%2),0 \n\t" - "vlef %%v26,0(%%r1,%2),1 \n\t" - "vlef %%v26,8(%%r1,%2),2 \n\t" - "vlef %%v26,8(%%r1,%2),3 \n\t" - "vlef %%v27,4(%%r1,%2),0 \n\t" - "vlef %%v27,4(%%r1,%2),1 \n\t" - "vlef %%v27,12(%%r1,%2),2 \n\t" - "vlef %%v27,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmasb 
%%v0,%%v25,%%v20,%%v0 \n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlef %%v28,0(%%r1,%1),0 \n\t" - "vlef %%v28,0(%%r1,%1),1 \n\t" - "vlef %%v28,8(%%r1,%1),2 \n\t" - "vlef %%v28,8(%%r1,%1),3 \n\t" - "vlef %%v29,4(%%r1,%1),0 \n\t" - "vlef %%v29,4(%%r1,%1),1 \n\t" - "vlef %%v29,12(%%r1,%1),2 \n\t" - "vlef %%v29,12(%%r1,%1),3 \n\t" - "vlef %%v30,0(%%r1,%2),0 \n\t" - "vlef %%v30,0(%%r1,%2),1 \n\t" - "vlef %%v30,8(%%r1,%2),2 \n\t" - "vlef %%v30,8(%%r1,%2),3 \n\t" - "vlef %%v31,4(%%r1,%2),0 \n\t" - "vlef %%v31,4(%%r1,%2),1 \n\t" - "vlef %%v31,12(%%r1,%2),2 \n\t" - "vlef %%v31,12(%%r1,%2),3 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlef %%v24,0(%%r1,%1),0 \n\t" + "vlef %%v24,0(%%r1,%1),1 \n\t" + "vlef %%v24,8(%%r1,%1),2 \n\t" + "vlef %%v24,8(%%r1,%1),3 \n\t" + "vlef %%v25,4(%%r1,%1),0 \n\t" + "vlef %%v25,4(%%r1,%1),1 \n\t" + "vlef %%v25,12(%%r1,%1),2 \n\t" + "vlef %%v25,12(%%r1,%1),3 \n\t" + "vlef %%v26,0(%%r1,%2),0 \n\t" + "vlef %%v26,0(%%r1,%2),1 \n\t" + "vlef %%v26,8(%%r1,%2),2 \n\t" + "vlef %%v26,8(%%r1,%2),3 \n\t" + "vlef %%v27,4(%%r1,%2),0 \n\t" + "vlef %%v27,4(%%r1,%2),1 \n\t" + "vlef %%v27,12(%%r1,%2),2 \n\t" + "vlef %%v27,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlef %%v28,0(%%r1,%1),0 \n\t" + "vlef %%v28,0(%%r1,%1),1 \n\t" + "vlef %%v28,8(%%r1,%1),2 \n\t" + "vlef %%v28,8(%%r1,%1),3 \n\t" + "vlef %%v29,4(%%r1,%1),0 \n\t" + "vlef %%v29,4(%%r1,%1),1 \n\t" + "vlef %%v29,12(%%r1,%1),2 \n\t" + "vlef %%v29,12(%%r1,%1),3 \n\t" + "vlef %%v30,0(%%r1,%2),0 \n\t" + "vlef %%v30,0(%%r1,%2),1 \n\t" + "vlef %%v30,8(%%r1,%2),2 \n\t" + "vlef %%v30,8(%%r1,%2),3 \n\t" + "vlef %%v31,4(%%r1,%2),0 \n\t" + "vlef %%v31,4(%%r1,%2),1 \n\t" + "vlef %%v31,12(%%r1,%2),2 \n\t" + "vlef %%v31,12(%%r1,%2),3 \n\t" "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" @@ -153,56 +153,56 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%3) \n\t" - "vlrepg %%v17,8(%3) \n\t" + "vlrepg %%v16,0(%3) \n\t" + "vlrepg %%v17,8(%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" #else - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + + "vlef 
%%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlef %%v20,0(%%r1,%1),0 \n\t" - "vlef %%v20,0(%%r1,%1),1 \n\t" - "vlef %%v20,8(%%r1,%1),2 \n\t" - "vlef %%v20,8(%%r1,%1),3 \n\t" - "vlef %%v21,4(%%r1,%1),0 \n\t" - "vlef %%v21,4(%%r1,%1),1 \n\t" - "vlef %%v21,12(%%r1,%1),2 \n\t" - "vlef %%v21,12(%%r1,%1),3 \n\t" - "vlef %%v22,0(%%r1,%2),0 \n\t" - "vlef %%v22,0(%%r1,%2),1 \n\t" - "vlef %%v22,8(%%r1,%2),2 \n\t" - "vlef %%v22,8(%%r1,%2),3 \n\t" - "vlef %%v23,4(%%r1,%2),0 \n\t" - "vlef %%v23,4(%%r1,%2),1 \n\t" - "vlef %%v23,12(%%r1,%2),2 \n\t" - "vlef %%v23,12(%%r1,%2),3 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlef %%v20,0(%%r1,%1),0 \n\t" + "vlef %%v20,0(%%r1,%1),1 \n\t" + "vlef %%v20,8(%%r1,%1),2 \n\t" + "vlef %%v20,8(%%r1,%1),3 \n\t" + "vlef %%v21,4(%%r1,%1),0 \n\t" + "vlef %%v21,4(%%r1,%1),1 \n\t" + "vlef %%v21,12(%%r1,%1),2 \n\t" + "vlef %%v21,12(%%r1,%1),3 \n\t" + "vlef %%v22,0(%%r1,%2),0 \n\t" + "vlef %%v22,0(%%r1,%2),1 \n\t" + "vlef %%v22,8(%%r1,%2),2 \n\t" + "vlef %%v22,8(%%r1,%2),3 \n\t" + "vlef %%v23,4(%%r1,%2),0 \n\t" + "vlef %%v23,4(%%r1,%2),1 \n\t" + "vlef %%v23,12(%%r1,%2),2 \n\t" + "vlef %%v23,12(%%r1,%2),3 \n\t" "vl %%v0,0(%%r1,%4) \n\t" "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" @@ -222,34 +222,34 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%2) \n\t" + "vlrepg %%v16,0(%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,4(%2),2 \n\t" "vflcsb %%v17,%%v17 \n\t" "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,0(%2),3 \n\t" #else "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,0(%2),3 \n\t" "vflcsb %%v17,%%v17 \n\t" "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,4(%2),2 \n\t" #endif "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" + "srlg %%r0,%0,1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%1) \n\t" "pfd 2,1024(%%r1,%3) \n\t" - "vlef %%v18,0(%%r1,%1),0 \n\t" - "vlef %%v18,0(%%r1,%1),1 \n\t" - "vlef %%v18,8(%%r1,%1),2 \n\t" - "vlef %%v18,8(%%r1,%1),3 \n\t" - "vlef %%v19,4(%%r1,%1),0 \n\t" - "vlef %%v19,4(%%r1,%1),1 \n\t" - "vlef %%v19,12(%%r1,%1),2 \n\t" - "vlef %%v19,12(%%r1,%1),3 \n\t" + "vlef %%v18,0(%%r1,%1),0 \n\t" + "vlef %%v18,0(%%r1,%1),1 \n\t" + "vlef %%v18,8(%%r1,%1),2 \n\t" + "vlef %%v18,8(%%r1,%1),3 \n\t" + "vlef %%v19,4(%%r1,%1),0 \n\t" + "vlef %%v19,4(%%r1,%1),1 \n\t" + "vlef %%v19,12(%%r1,%1),2 \n\t" + "vlef %%v19,12(%%r1,%1),3 \n\t" "vl %%v0,0(%%r1,%3) \n\t" "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" @@ -268,18 +268,18 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al { __asm__ volatile ( #if !defined(XCONJ) - "vlrepf %%v0,%3 \n\t" - "vlef %%v1,%4,0 \n\t" - "vlef %%v1,%4,2 \n\t" + "vlrepf %%v0,%3 \n\t" + "vlef %%v1,%4,0 \n\t" + "vlef %%v1,%4,2 \n\t" "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,%4,1 \n\t" + "vlef %%v1,%4,1 \n\t" "vlef %%v1,%4,3 \n\t" #else "vlef %%v0,%3,1 \n\t" - "vlef %%v0,%3,3 \n\t" + "vlef %%v0,%3,3 \n\t" "vflcsb %%v0,%%v0 \n\t" "vlef %%v0,%3,0 \n\t" - "vlef %%v0,%3,2 \n\t" + "vlef 
%%v0,%3,2 \n\t" "vlrepf %%v1,%4 \n\t" #endif "xgr %%r1,%%r1 \n\t" @@ -292,7 +292,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al "vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,0(%%r1,%2) \n\t" "vl %%v19,16(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" + "verllg %%v20,%%v16,32 \n\t" "verllg %%v21,%%v17,32 \n\t" "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" From ae1d1f74f7ff96b8345189bcba058b7acdc7d494 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 16:55:33 +0100 Subject: [PATCH 364/935] Query AVX2 and AVX512 capability for runtime cpu selection --- driver/others/dynamic.c | 141 +++++++++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1f67dc521..7cc911d32 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -304,9 +304,47 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#ifndef NO_AVX512 + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 1){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" +#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512 instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -403,18 +441,24 @@ static gotoblas_t *get_coretype(void){ } //Intel Haswell if (model == 12 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 13) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -424,27 +468,36 @@ static gotoblas_t *get_coretype(void){ case 4: //Intel Haswell if (model == 5 || model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } //Intel Broadwell if (model == 7 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -457,40 +510,50 @@ static gotoblas_t *get_coretype(void){ case 5: //Intel Broadwell if (model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } if (model == 5) { // Intel Skylake X -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) return &gotoblas_HASWELL; - else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } -#endif + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Phi Knights Landing if (model == 7) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -503,26 +566,26 @@ static gotoblas_t *get_coretype(void){ case 6: if (model == 6) { // Cannon Lake -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return &gotoblas_HASWELL; -#else - return &gotoblas_SANDYBRIDGE; -#endif - else - return &gotoblas_NEHALEM; -#endif + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} From 0afaae4b2323b28af49ffe81b98d17bd4ced96f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 16:58:56 +0100 Subject: [PATCH 365/935] Query AVX2 and AVX512VL capability in x86 cpu detection --- common_x86_64.h | 2 +- cpuid.h | 1 + cpuid_x86.c | 132 +++++++++++++++++++++++++++--------------------- 3 files changed, 76 insertions(+), 59 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 62e138e34..f27c1e9be 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op)); + : "0" (op), "c"(0)); #endif } diff --git a/cpuid.h b/cpuid.h index a6bc211f3..c56672ad8 100644 --- a/cpuid.h +++ b/cpuid.h @@ -139,6 +139,7 @@ #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) #define HAVE_AVX512VL (1 << 21) +#define HAVE_AVX2 (1 << 22) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 diff --git a/cpuid_x86.c b/cpuid_x86.c index eb986b6b6..ddc09857b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ ("mov %%ebx, %%edi;" "cpuid;" "xchgl %%ebx, %%edi;" - : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); #else __asm__ __volatile__ - ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); #endif } @@ -211,6 +211,42 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#ifndef NO_AVX512 + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & 32) != 32){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; @@ -294,6 +330,8 @@ int get_cputype(int gettype){ if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; #ifndef NO_AVX if (support_avx()) feature |= HAVE_AVX; + if (support_avx2()) feature |= HAVE_AVX2; + if (support_avx512()) feature |= HAVE_AVX512VL; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1228,22 +1266,18 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: case 15: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 13: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -1252,33 +1286,27 @@ int get_cpuname(void){ switch (model) { case 5: case 6: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: case 15: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 14: //Skylake - if(support_avx()) -#ifndef NO_AVX2 + 
if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1292,46 +1320,36 @@ int get_cpuname(void){ switch (model) { case 6: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 5: // Skylake X -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif case 14: // Skylake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: // Xeon Phi Knights Landing - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1342,30 +1360,24 @@ int get_cpuname(void){ case 6: switch (model) { case 6: // Cannon Lake -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif } break; case 9: case 8: switch (model) { case 14: // Kaby Lake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -2112,6 +2124,8 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); + if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); + if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2180,6 +2194,8 @@ void get_sse(void){ if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); + if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); + if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); From 68eb3146ce4c50ac557cf5f199cc1b4294ba3817 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 18:07:14 +0100 Subject: [PATCH 366/935] Add xcr0 (os support) check --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index ddc09857b..377267fcc 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -239,6 +239,8 @@ int support_avx512(){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL } return ret; From e1574fa2b4a2a781be70d8d521bb3b80a572ca9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 18:08:02 +0100 
Subject: [PATCH 367/935] Add xcr0 (os support) check --- driver/others/dynamic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 7cc911d32..4c966260d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,6 +332,8 @@ int support_avx512(){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL } return ret; From 31ed19e8b907f72ed4c8ef3165d8577b55264861 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 19:41:13 +0100 Subject: [PATCH 368/935] Add message for SkylakeX and KNL fallbacks to Haswell --- driver/others/dynamic.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4c966260d..ba93fca8b 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -346,7 +346,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" #define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" -#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512 instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. 
OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -526,8 +526,10 @@ static gotoblas_t *get_coretype(void){ // Intel Skylake X if (support_avx512()) return &gotoblas_SKYLAKEX; - if(support_avx2()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; + } if(support_avx()) { openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); return &gotoblas_SANDYBRIDGE; @@ -550,8 +552,10 @@ static gotoblas_t *get_coretype(void){ } //Intel Phi Knights Landing if (model == 7) { - if(support_avx2()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; + } if(support_avx()) { openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); return &gotoblas_SANDYBRIDGE; From 191677b902054d1476f3bb12b5360c337c47eb7e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 10:46:47 +0100 Subject: [PATCH 369/935] Add travis_wait to the OSX brew install phase --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3f323a854..e8b7e0a27 100644 --- a/.travis.yml +++ b/.travis.yml @@ -153,7 +153,7 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - brew install gcc # for gfortran + - travis_wait 30 brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: From cf5d48e83300a5eb2bb047829fc793ba78959c35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 14:41:48 +0100 Subject: [PATCH 370/935] Update OSX environment to Sierra as homebrew seems to have dropped support for El Capitan in their gcc packages --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3f323a854..51679af62 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,7 +149,7 @@ matrix: - &test-macos os: osx - osx_image: xcode8 + osx_image: xcode8.3 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update From 1650311246d185ca2631c76c33c0212848b57d2a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 14:43:45 +0100 Subject: [PATCH 371/935] Bump xcode to 8.3 --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index e8b7e0a27..51679af62 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,11 +149,11 @@ matrix: - &test-macos os: osx - osx_image: xcode8 + osx_image: xcode8.3 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - travis_wait 30 brew install gcc # for gfortran + - brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: From 3eafcfa6507891f7fff781423d9eb6af13501133 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 07:43:45 +0200 Subject: [PATCH 372/935] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index c939aea9f..7b5e43497 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -119,22 +119,22 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - "vlef %%v28,0(%%r1,%1),0 \n\t" - "vlef %%v28,0(%%r1,%1),1 \n\t" - "vlef %%v28,8(%%r1,%1),2 \n\t" - "vlef %%v28,8(%%r1,%1),3 \n\t" 
- "vlef %%v29,4(%%r1,%1),0 \n\t" - "vlef %%v29,4(%%r1,%1),1 \n\t" - "vlef %%v29,12(%%r1,%1),2 \n\t" - "vlef %%v29,12(%%r1,%1),3 \n\t" - "vlef %%v30,0(%%r1,%2),0 \n\t" - "vlef %%v30,0(%%r1,%2),1 \n\t" - "vlef %%v30,8(%%r1,%2),2 \n\t" - "vlef %%v30,8(%%r1,%2),3 \n\t" - "vlef %%v31,4(%%r1,%2),0 \n\t" - "vlef %%v31,4(%%r1,%2),1 \n\t" - "vlef %%v31,12(%%r1,%2),2 \n\t" - "vlef %%v31,12(%%r1,%2),3 \n\t" + "vlef %%v28,0(%%r1,%3),0 \n\t" + "vlef %%v28,0(%%r1,%3),1 \n\t" + "vlef %%v28,8(%%r1,%3),2 \n\t" + "vlef %%v28,8(%%r1,%3),3 \n\t" + "vlef %%v29,4(%%r1,%3),0 \n\t" + "vlef %%v29,4(%%r1,%3),1 \n\t" + "vlef %%v29,12(%%r1,%3),2 \n\t" + "vlef %%v29,12(%%r1,%3),3 \n\t" + "vlef %%v30,0(%%r1,%4),0 \n\t" + "vlef %%v30,0(%%r1,%4),1 \n\t" + "vlef %%v30,8(%%r1,%4),2 \n\t" + "vlef %%v30,8(%%r1,%4),3 \n\t" + "vlef %%v31,4(%%r1,%4),0 \n\t" + "vlef %%v31,4(%%r1,%4),1 \n\t" + "vlef %%v31,12(%%r1,%4),2 \n\t" + "vlef %%v31,12(%%r1,%4),3 \n\t" "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" From e7455f500c06ecda4085d560ffa20c5bc188416f Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:33:54 +0200 Subject: [PATCH 373/935] [ZARCH] fix dsdot.c --- kernel/zarch/dsdot.c | 123 ++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 67 deletions(-) diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 17461a029..800bb0d51 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -27,61 +27,34 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { double dot; __asm__ volatile ( "vzero %%v0 \n\t" - "srlg %%r0,%1,5 \n\t" + "srlg %%r0,%1,4 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmsb %%v16,%%v16,%%v24 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmsb %%v17,%%v17,%%v25 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmsb %%v18,%%v18,%%v26 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmsb %%v19,%%v19,%%v27 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmsb %%v20,%%v20,%%v28 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmsb %%v21,%%v21,%%v29 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmsb %%v22,%%v22,%%v30 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmsb %%v23,%%v23,%%v31 \n\t" - - "vflls %%v24,%%v16 \n\t" - "vflls %%v25,%%v17 \n\t" - "vflls %%v26,%%v18 \n\t" - "vflls %%v27,%%v19 \n\t" - "vflls %%v28,%%v20 \n\t" - "vflls %%v29,%%v21 \n\t" - "vflls %%v30,%%v22 \n\t" - "vflls %%v31,%%v23 \n\t" - - "veslg %%v16,%%v16,32 \n\t" - "veslg %%v17,%%v17,32 \n\t" - "veslg %%v18,%%v18,32 \n\t" - "veslg %%v19,%%v19,32 \n\t" - "veslg %%v20,%%v20,32 \n\t" - "veslg %%v21,%%v21,32 \n\t" - "veslg %%v22,%%v22,32 \n\t" - "veslg %%v23,%%v23,32 \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v16,4(%%r1,%2),2 \n\t" + "vlef %%v17,8(%%r1,%2),0 \n\t" + "vlef %%v17,12(%%r1,%2),2 \n\t" + "vlef %%v18,16(%%r1,%2),0 \n\t" + "vlef %%v18,20(%%r1,%2),2 \n\t" + "vlef %%v19,24(%%r1,%2),0 \n\t" + "vlef %%v19,28(%%r1,%2),2 \n\t" + "vlef %%v20,32(%%r1,%2),0 \n\t" + "vlef %%v20,36(%%r1,%2),2 \n\t" + "vlef %%v21,40(%%r1,%2),0 \n\t" + "vlef %%v21,44(%%r1,%2),2 \n\t" + "vlef %%v22,48(%%r1,%2),0 \n\t" + "vlef 
%%v22,52(%%r1,%2),2 \n\t" + "vlef %%v23,56(%%r1,%2),0 \n\t" + "vlef %%v23,60(%%r1,%2),2 \n\t" "vflls %%v16,%%v16 \n\t" "vflls %%v17,%%v17 \n\t" @@ -92,24 +65,40 @@ static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) "vflls %%v22,%%v22 \n\t" "vflls %%v23,%%v23 \n\t" - "vfadb %%v16,%%v16,%%v24 \n\t" - "vfadb %%v17,%%v17,%%v25 \n\t" - "vfadb %%v18,%%v18,%%v26 \n\t" - "vfadb %%v19,%%v19,%%v27 \n\t" - "vfadb %%v20,%%v20,%%v28 \n\t" - "vfadb %%v21,%%v21,%%v29 \n\t" - "vfadb %%v22,%%v22,%%v30 \n\t" - "vfadb %%v23,%%v23,%%v31 \n\t" - "vfadb %%v16,%%v16,%%v20 \n\t" - "vfadb %%v17,%%v17,%%v21 \n\t" - "vfadb %%v18,%%v18,%%v22 \n\t" - "vfadb %%v19,%%v19,%%v23 \n\t" - "vfadb %%v16,%%v16,%%v18 \n\t" - "vfadb %%v17,%%v17,%%v19 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v0,%%v16,%%v0 \n\t" - - "agfi %%r1,128 \n\t" + "vlef %%v24,0(%%r1,%3),0 \n\t" + "vlef %%v24,4(%%r1,%3),2 \n\t" + "vflls %%v24,%%v24 \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vlef %%v25,8(%%r1,%3),0 \n\t" + "vlef %%v25,12(%%r1,%3),2 \n\t" + "vflls %%v25,%%v25 \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + "vlef %%v26,16(%%r1,%3),0 \n\t" + "vlef %%v26,20(%%r1,%3),2 \n\t" + "vflls %%v26,%%v26 \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + "vlef %%v27,24(%%r1,%3),0 \n\t" + "vlef %%v27,28(%%r1,%3),2 \n\t" + "vflls %%v27,%%v27 \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + "vlef %%v28,32(%%r1,%3),0 \n\t" + "vlef %%v28,36(%%r1,%3),2 \n\t" + "vflls %%v28,%%v28 \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + "vlef %%v29,40(%%r1,%3),0 \n\t" + "vlef %%v29,44(%%r1,%3),2 \n\t" + "vflls %%v29,%%v29 \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + "vlef %%v30,48(%%r1,%3),0 \n\t" + "vlef %%v30,52(%%r1,%3),2 \n\t" + "vflls %%v30,%%v30 \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + "vlef %%v31,56(%%r1,%3),0 \n\t" + "vlef %%v31,60(%%r1,%3),2 \n\t" + "vflls %%v31,%%v31 \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,64 \n\t" "brctg %%r0,0b \n\t" "vrepg %%v1,%%v0,1 \n\t" "adbr %%f0,%%f1 \n\t" @@ -134,10 +123,10 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 ) - dot = dsdot_kernel_32(n1,x,y); + dot = dsdot_kernel_16(n1,x,y); i = n1; while(i < n) From c2ffef81569624cc530d515bbaac9890d819253b Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:49:44 +0200 Subject: [PATCH 374/935] [ZARCH] fix data prefetch type in ddot --- kernel/zarch/ddot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f34d1e96e..ff4c347a6 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -37,7 +37,7 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%2) \n\t" From be66f5d5c21b558dd1ef35dc8f4bda6b544b4f79 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:50:07 +0200 Subject: [PATCH 375/935] [ZARCH] fix data prefetch type in sdot --- kernel/zarch/sdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index fd8c8e445..5ddbc69bd 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -37,7 +37,7 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%2) \n\t" "vl 
%%v17,16(%%r1,%2) \n\t" From ad2c386d6ad99d3021e33cbbfb311150b2586c93 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jan 2019 00:32:50 +0100 Subject: [PATCH 376/935] Move TLS key deletion to openblas_quit fixes #1954 (as suggested by thrasibule in that issue) --- driver/others/memory.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 6f7a7db82..72d3e173c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1073,11 +1073,6 @@ static volatile int memory_initialized = 0; } free(table); } -#if defined(OS_WINDOWS) - TlsFree(local_storage_key); -#else - pthread_key_delete(local_storage_key); -#endif } static void blas_memory_init(){ @@ -1491,6 +1486,14 @@ void DESTRUCTOR gotoblas_quit(void) { blas_shutdown(); +#if defined(SMP) +#if defined(OS_WINDOWS) + TlsFree(local_storage_key); +#else + pthread_key_delete(local_storage_key); +#endif +#endif + #ifdef PROFILE moncontrol (0); #endif From 67432b23c2fe7f8ef29cf85821278dcdf69b4db2 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 16:44:46 +0200 Subject: [PATCH 377/935] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 7b5e43497..a45c3d687 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, ap[3] = ap[2] + lda; x_ptr = x; //zero_y(NB,ybuffer); - memset(ybuffer,0,NB*16); + memset(ybuffer,0,NB*8); if ( inc_x == 2 ) { From 5d89d6b143ea770e4dcb2336319b543f2297c6ba Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:08:24 +0200 Subject: [PATCH 378/935] [ZARCH] fix sgemv_n_4.c --- kernel/zarch/sgemv_n_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 92019d732..01d8414de 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -435,7 +435,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ap[3] = ap[2] + lda; if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); + memset(ybuffer,0,NB*4); else ybuffer = y_ptr; @@ -465,8 +465,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; + /* a_ptr += lda; + x_ptr += 1; */ } From ecc31b743fc93d3b5951e83e6e37148dbdd381c8 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:13:02 +0200 Subject: [PATCH 379/935] Update dgemv_t_4.c --- kernel/zarch/dgemv_t_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index f9c1f966d..2d8fa0d10 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -601,9 +601,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; + // a_ptr += lda; *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + // y_ptr += inc_y; } a += NB; From b731e8246f9fad13637005be39d8566111bab9fe Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:14:04 +0200 Subject: [PATCH 380/935] Update sgemv_t_4.c --- kernel/zarch/sgemv_t_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index fe99ef5ce..5515d7bb7 100644 
--- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -605,9 +605,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; + // a_ptr += lda; *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + // y_ptr += inc_y; } a += NB; From 621dedb37bd1d33c7006c305b4057bb0cc7ea7cd Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:37:11 +0200 Subject: [PATCH 381/935] [ZARCH] Update cgemv_t_4.c --- kernel/zarch/cgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 89914fb1f..0dd43057c 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 1024 +#define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { From 406f835f00fedcfef894742b30a7f48905836eee Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:39:17 +0200 Subject: [PATCH 382/935] [ZARCH] update cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index a45c3d687..ed81325e1 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#define NBMAX 1024 +#define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { From 1a7925b3a335114d26bd1d25d6f6fdc2743909b6 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:43:11 +0200 Subject: [PATCH 383/935] [ZARCH] Update dgemv_n_4.c --- kernel/zarch/dgemv_n_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index ca6d287bc..ca4fd6170 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -488,8 +488,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; + /* a_ptr += lda; + x_ptr += 1; */ } From 00401489c2d82e1dd997f91480fe6bc441cd6b40 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Jan 2019 22:38:32 +0100 Subject: [PATCH 384/935] Fix missing braces in support_avx() --- cpuid_x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 377267fcc..74cc6655b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -216,7 +216,7 @@ int support_avx2(){ int eax, ebx, ecx=0, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 0) @@ -232,7 +232,7 @@ int support_avx512(){ int eax, ebx, ecx, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & 32) != 32){ From dbc9a060ef4d6ba08b21352f22bb2fa989db0919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Jan 2019 22:41:31 +0100 Subject: [PATCH 385/935] Fix missing braces in support_avx() call --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ba93fca8b..9e59da2cc 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -309,7 +309,7 @@ int support_avx2(){ int eax, ebx, ecx=0, edx; int
ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 0) @@ -325,7 +325,7 @@ int support_avx512(){ int eax, ebx, ecx, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 1){ From b815a04c87e49a01e66e1c41ce4654f8d7817f83 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 15 Jan 2019 21:04:22 +0200 Subject: [PATCH 386/935] [ZARCH] fix a bug in max/min functions --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/damax.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/idamax.c | 2 +- kernel/zarch/idamin.c | 2 +- kernel/zarch/idmax.c | 2 +- kernel/zarch/idmin.c | 2 +- kernel/zarch/isamax.c | 2 +- kernel/zarch/isamin.c | 2 +- kernel/zarch/ismax.c | 2 +- kernel/zarch/ismin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 3506c4e9b..2c913b62e 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -237,7 +237,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 726747b99..733f98fbf 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -237,7 +237,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index b74af5d37..236d11c72 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -172,7 +172,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 4cf5e88b1..c2c63c6c5 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -172,7 +172,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index de38bd21a..469f65735 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -148,7 +148,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index d7c86735f..3df504950 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -148,7 +148,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index d1f135369..4f7ff6985 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -226,7 +226,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 679606a8f..3abc7a558 100644 --- 
a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -226,7 +226,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 5de41ac7b..313a88db4 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -202,7 +202,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 7fec111cf..42443215b 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -202,7 +202,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index d2686c0cd..dd2144db2 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -269,7 +269,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 768f31a8c..d7e44421d 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -269,7 +269,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 8fc32adf6..1ebc6c8c8 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -245,7 +245,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 415052810..a6b9d59de 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -245,7 +245,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index 1025cfcbf..61d50159f 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -176,7 +176,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 3b8f03e6a..a585a79ff 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -176,7 +176,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 33798eb7c..bcdb473af 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index e882b7ff1..91c31d284 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/zamax.c 
b/kernel/zarch/zamax.c index 937bc9753..8ef3f42ca 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -189,7 +189,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 8564edaf4..30fd1d030 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -189,7 +189,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); From 29dc72889f5c0544aee8bc5f2dee98603cbfec36 Mon Sep 17 00:00:00 2001 From: caiyu Date: Wed, 16 Jan 2019 14:25:19 +0800 Subject: [PATCH 387/935] Add support for Hygon Dhyana --- cpuid.h | 5 ++++ cpuid_x86.c | 54 +++++++++++++++++++++++++++++++++++++---- driver/others/dynamic.c | 11 ++++++++- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/cpuid.h b/cpuid.h index c56672ad8..697f43133 100644 --- a/cpuid.h +++ b/cpuid.h @@ -53,6 +53,7 @@ #define VENDOR_SIS 8 #define VENDOR_TRANSMETA 9 #define VENDOR_NSC 10 +#define VENDOR_HYGON 11 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -116,6 +117,7 @@ #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 #define CORE_SKYLAKEX 28 +#define CORE_DHYANA 29 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -215,5 +217,8 @@ typedef struct { #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 #define CPUTYPE_SKYLAKEX 52 +#define CPUTYPE_DHYANA 53 + +#define CPUTYPE_HYGON_UNKNOWN 54 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 74cc6655b..726014033 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -271,6 +271,7 @@ int get_vendor(void){ if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; + if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -1046,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ } } - if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { + if ((get_vendor() == VENDOR_AMD) || + (get_vendor() == VENDOR_HYGON) || + (get_vendor() == VENDOR_CENTAUR)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; @@ -1483,6 +1486,26 @@ int get_cpuname(void){ return CPUTYPE_AMD_UNKNOWN; } + if (vendor == VENDOR_HYGON){ + switch (family) { + case 0xf: + switch (exfamily) { + case 9: + //Hygon Dhyana + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; + } + break; + } + return CPUTYPE_HYGON_UNKNOWN; + } + if (vendor == VENDOR_CYRIX){ switch (family) { case 0x4: @@ -1604,7 +1627,8 @@ static char *cpuname[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", - "SKYLAKEX" + "SKYLAKEX", + "DHYANA" }; static char *lowercpuname[] = { @@ -1659,7 +1683,8 @@ static char *lowercpuname[] = { "steamroller", "excavator", "zen", - "skylakex" + "skylakex", + "dhyana" }; static char *corename[] = { @@ -1691,7 +1716,8 @@ static char *corename[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", - "SKYLAKEX" + "SKYLAKEX", + "DHYANA" }; static char *corename_lower[] = { @@ -1723,7 +1749,8 @@ static char *corename_lower[] = { "steamroller", "excavator", "zen", - "skylakex" + "skylakex", + "dhyana" }; @@ -2040,6 +2067,23 @@ int get_coretype(void){ } } 
+ if (vendor == VENDOR_HYGON){ + if (family == 0xf){ + if (exfamily == 9) { + if(support_avx()) +#ifndef NO_AVX2 + return CORE_ZEN; +#else + return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CORE_BARCELONA; + } else { + return CORE_BARCELONA; + } + } + } + if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9e59da2cc..99c9254ac 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define VENDOR_INTEL 1 #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 +#define VENDOR_HYGON 4 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -369,6 +370,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -604,7 +606,7 @@ static gotoblas_t *get_coretype(void){ } } - if (vendor == VENDOR_AMD){ + if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ if (family <= 0xe) { // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon cpuid(0x80000000, &eax, &ebx, &ecx, &edx); @@ -684,6 +686,13 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } + } else if (exfamily == 9) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } }else { return &gotoblas_BARCELONA; } From b70fd238366c6a822c7f1766ab125f64c67a6b39 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:18:54 +0200 Subject: [PATCH 389/935] disable NaN checks before BLAS calls dsolve.R --- benchmark/scripts/R/dsolve.R | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index a3fb78da7..6f1b8ef7b 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,6 +2,10 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) { + options(matprod = "blas") +} + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +23,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,31 +30,23 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) - B <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) + B <- matrix(rnorm(n * n), nrow = n) z <- system.time(for (l in 1:loops) { solve(A, B) }) - mflops <- - (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } + From 2777a7f506308550e37f7ef26ce05f53a0d096ef Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 
Jan 2019 11:23:51 +0200 Subject: [PATCH 390/935] disable NaN checks before BLAS calls dsolve.R (shorter config part) --- benchmark/scripts/R/dsolve.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index 6f1b8ef7b..ad2045900 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,9 +2,7 @@ argv <- commandArgs(trailingOnly = TRUE) -if (!is.null(options("matprod")[[1]])) { - options(matprod = "blas") -} +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") nfrom <- 128 nto <- 2048 @@ -42,11 +40,10 @@ while (n <= nto) { solve(A, B) }) - mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e6) + mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep } - From 7af8b21dbbb523b0e9ab6caff271cb63affaa5f2 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:34:46 +0200 Subject: [PATCH 391/935] disable NaN checks before BLAS calls dsolve.R (shorter formula) --- benchmark/scripts/R/dsolve.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index ad2045900..46301570b 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -40,7 +40,7 @@ while (n <= nto) { solve(A, B) }) - mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e+06) + mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) From 3afceb6c2a220ff61878c9a328846cc723de42ed Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:38:14 +0200 Subject: [PATCH 392/935] disable NaN checks before BLAS calls deig.R --- benchmark/scripts/R/deig.R | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index ece727fb3..32716471b 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,14 +28,7 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom @@ -45,11 +39,10 @@ while (n <= nto) { ev <- eigen(A) }) - mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } From 478d3c4569cd4957bbef779423ee7e51686b5c0a Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:41:46 +0200 Subject: [PATCH 393/935] disable NaN checks before BLAS calls deig.R (shorten matrix def) --- benchmark/scripts/R/deig.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index 32716471b..c6d541dcf 100755 --- 
a/benchmark/scripts/R/deig.R
+++ b/benchmark/scripts/R/deig.R
@@ -33,7 +33,7 @@ cat(sprintf(" SIZE Flops Time\n"))
 n <- nfrom
 while (n <= nto) {
-  A <- matrix(rnorm(n * n), ncol = n, nrow = n)
+  A <- matrix(rnorm(n * n), nrow = n)
   ev <- 0
   z <- system.time(for (l in 1:loops) {
     ev <- eigen(A)

From 3e601bd4195b24568eb4f7db2402ba3258fd82cc Mon Sep 17 00:00:00 2001
From: Andrew <16061801+brada4@users.noreply.github.com>
Date: Wed, 16 Jan 2019 11:54:22 +0200
Subject: [PATCH 394/935] disable NaN checks before BLAS calls dgemm.R

---
 benchmark/scripts/R/dgemm.R | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/benchmark/scripts/R/dgemm.R b/benchmark/scripts/R/dgemm.R
index 75297dfb8..d7c3e8108 100755
--- a/benchmark/scripts/R/dgemm.R
+++ b/benchmark/scripts/R/dgemm.R
@@ -2,6 +2,8 @@
 
 argv <- commandArgs(trailingOnly = TRUE)
 
+if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
+
 nfrom <- 128
 nto <- 2048
 nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
       loops <- as.numeric(argv[z])
     }
   }
-
 }
 
 p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,26 +28,13 @@ if (p != "") {
   loops <- as.numeric(p)
 }
 
-
-cat(sprintf(
-  "From %.0f To %.0f Step=%.0f Loops=%.0f\n",
-  nfrom,
-  nto,
-  nstep,
-  loops
-))
+cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
 cat(sprintf(" SIZE Flops Time\n"))
 
 n <- nfrom
 while (n <= nto) {
-  A <- matrix(runif(n * n),
-              ncol = n,
-              nrow = n,
-              byrow = TRUE)
-  B <- matrix(runif(n * n),
-              ncol = n,
-              nrow = n,
-              byrow = TRUE)
+  A <- matrix(runif(n * n), nrow = n)
+  B <- matrix(runif(n * n), nrow = n)
   C <- 1
 
   z <- system.time(for (l in 1:loops) {
@@ -54,11 +42,10 @@ while (n <= nto) {
     l <- l + 1
   })
 
-  mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
+  mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
 
   st <- sprintf("%.0fx%.0f :", n, n)
   cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
 
   n <- n + nstep
-
 }

From 8c3386be8780bdf631ffebe085fde2591d4cd062 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 16 Jan 2019 15:16:21 +0000
Subject: [PATCH 395/935] Added missing Blas1 single fp {saxpy, caxpy, cdot,
 crot (refactored version of srot), isamax, isamin, icamax, icamin}; fixed
 idamin, icamin to choose the first-occurrence index among equal minima

---
 kernel/power/KERNEL.POWER8 |  20 +--
 kernel/power/caxpy.c       | 145 +++++++++++++++++++
 kernel/power/cdot.c        | 164 +++++++++++++++++++++
 kernel/power/crot.c        | 213 +++++++++++++++++++++++++++
 kernel/power/icamax.c      | 261 +++++++++++++++++++++++++++++++++
 kernel/power/icamin.c      | 266 ++++++++++++++++++++++++++++++++++
 kernel/power/idamin.c      |  50 +++----
 kernel/power/isamax.c      | 288 +++++++++++++++++++++++++++++++++++++
 kernel/power/isamin.c      | 288 +++++++++++++++++++++++++++++++++++++
 kernel/power/izamin.c      |  26 ++--
 kernel/power/saxpy.c       | 129 +++++++++++++++++
 11 files changed, 1802 insertions(+), 48 deletions(-)
 create mode 100644 kernel/power/caxpy.c
 create mode 100644 kernel/power/cdot.c
 create mode 100644 kernel/power/crot.c
 create mode 100644 kernel/power/icamax.c
 create mode 100644 kernel/power/icamin.c
 create mode 100644 kernel/power/isamax.c
 create mode 100644 kernel/power/isamin.c
 create mode 100644 kernel/power/saxpy.c

diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 00ff8682a..cbcffb8fe 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 #SMINKERNEL = ../arm/min.c
 #DMINKERNEL = ../arm/min.c
 #
-#ISAMAXKERNEL = ../arm/iamax.c
+ISAMAXKERNEL = isamax.c
IDAMAXKERNEL = idamax.c -#ICAMAXKERNEL = ../arm/izamax.c -IZAMAXKERNEL = izamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c # -#ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = idamin.c -#ICAMINKERNEL = ../arm/izamin.c +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # -#SAXPYKERNEL = ../arm/axpy.c +SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -#CAXPYKERNEL = ../arm/zaxpy.c +CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c @@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -#CDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = ../arm/zrot.c +CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c diff --git a/kernel/power/caxpy.c b/kernel/power/caxpy.c new file mode 100644 index 000000000..4bdf13c34 --- /dev/null +++ b/kernel/power/caxpy.c @@ -0,0 +1,145 @@ +/* +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#ifndef HAVE_ASM_KERNEL +#include +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) +{ + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r}; + register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i}; + +#else + register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r}; + register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; +#endif + + __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + register __vector float *vy = (__vector float *) y; + register __vector float *vx = (__vector float *) x; + BLASLONG i=0; + for (; i < n/2; i += 8) { + + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float vy_2 = vy[i + 2]; + register __vector float vy_3 = vy[i + 3]; + register __vector float vy_4 = vy[i + 4]; + register __vector float vy_5 = vy[i + 5]; + register __vector float vy_6 = vy[i + 6]; + register __vector float vy_7 = vy[i + 7]; + register __vector float vx_0 = vx[i]; + register __vector float vx_1 = vx[i + 1]; + register __vector float vx_2 = vx[i + 2]; + register __vector float vx_3 = vx[i + 3]; + register __vector float vx_4 = vx[i + 4]; + register __vector float vx_5 = vx[i + 5]; + register __vector float vx_6 = vx[i + 6]; + register __vector float vx_7 = vx[i + 7]; + vy_0 += vx_0*valpha_r; + vy_1 += vx_1*valpha_r; + vy_2 += vx_2*valpha_r; + vy_3 += vx_3*valpha_r; + vy_4 += vx_4*valpha_r; + vy_5 += vx_5*valpha_r; + vy_6 += vx_6*valpha_r; + vy_7 += vx_7*valpha_r; + vx_0 = vec_perm(vx_0, vx_0, swap_mask); + vx_1 = vec_perm(vx_1, vx_1, swap_mask); + vx_2 = vec_perm(vx_2, vx_2, swap_mask); + vx_3 = vec_perm(vx_3, vx_3, swap_mask); + vx_4 = vec_perm(vx_4, vx_4, swap_mask); + vx_5 = vec_perm(vx_5, vx_5, swap_mask); + vx_6 = vec_perm(vx_6, vx_6, swap_mask); + vx_7 = vec_perm(vx_7, vx_7, swap_mask); + vy_0 += vx_0*valpha_i; + vy_1 += vx_1*valpha_i; + vy_2 += vx_2*valpha_i; + vy_3 += vx_3*valpha_i; + vy_4 += vx_4*valpha_i; + vy_5 += vx_5*valpha_i; + vy_6 += vx_6*valpha_i; + vy_7 += vx_7*valpha_i; + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + vy[i + 4] = vy_4; + vy[i + 5] = vy_5 ; + vy[i + 6] = vy_6 ; + vy[i + 7] = vy_7 ; + + } +} +#endif +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + if (n <= 0) return (0); + if ((inc_x == 1) && (inc_y == 1)) { + BLASLONG n1 = n & -16; + if (n1) { + caxpy_kernel_16(n1, x, y, da_r,da_i); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + } + return (0); + + } + inc_x *= 2; + inc_y *= 2; + while (i < n) { +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} + diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c new file mode 
100644
index 000000000..f86a33f22
--- /dev/null
+++ b/kernel/power/cdot.c
@@ -0,0 +1,164 @@
+/*Copyright (c) 2013-2018, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "common.h"
+
+#ifndef HAVE_KERNEL_8
+#include <altivec.h>
+static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
+{
+    __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+    register __vector float *vy = (__vector float *) y;
+    register __vector float *vx = (__vector float *) x;
+    BLASLONG i = 0;
+    register __vector float vd_0 = { 0 };
+    register __vector float vd_1 = { 0 };
+    register __vector float vd_2 = { 0 };
+    register __vector float vd_3 = { 0 };
+    register __vector float vdd_0 = { 0 };
+    register __vector float vdd_1 = { 0 };
+    register __vector float vdd_2 = { 0 };
+    register __vector float vdd_3 = { 0 };
+    for (; i < n/2; i += 4) {
+
+        register __vector float vyy_0 ;
+        register __vector float vyy_1 ;
+        register __vector float vyy_2 ;
+        register __vector float vyy_3 ;
+
+        register __vector float vy_0 = vy[i];
+        register __vector float vy_1 = vy[i + 1];
+        register __vector float vy_2 = vy[i + 2];
+        register __vector float vy_3 = vy[i + 3];
+        register __vector float vx_0 = vx[i];
+        register __vector float vx_1 = vx[i + 1];
+        register __vector float vx_2 = vx[i + 2];
+        register __vector float vx_3 = vx[i + 3];
+        vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
+        vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
+        vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
+        vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
+
+        vd_0 += vx_0 * vy_0;
+        vd_1 += vx_1 * vy_1;
+        vd_2 += vx_2 * vy_2;
+        vd_3 += vx_3 * vy_3;
+
+        vdd_0 += vx_0 * vyy_0;
+        vdd_1 += vx_1 * vyy_1;
+        vdd_2 += vx_2 * vyy_2;
+        vdd_3 += vx_3 * vyy_3;
+
+    }
+    //aggregate
+    vd_0 = vd_0 + vd_1 + vd_2 + vd_3;
+    vdd_0 = vdd_0 + vdd_1 + vdd_2 + vdd_3;
+    //reverse and aggregate
+    vd_1 = vec_xxpermdi(vd_0, vd_0, 2);
+    vdd_1 = vec_xxpermdi(vdd_0, vdd_0, 2);
+    vd_2 = vd_0 + vd_1;
+    vdd_2 = vdd_0 + vdd_1;
+
+    dot[0] = vd_2[0];
+    dot[1] = vd_2[1];
+    dot[2] = vdd_2[0];
+    dot[3] = vdd_2[1];
+
+}
+#endif
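+/* Descriptive note on the kernel above: it accumulates four partial sums,
+   dot[0] = sum(x_r*y_r), dot[1] = sum(x_i*y_i), dot[2] = sum(x_r*y_i) and
+   dot[3] = sum(x_i*y_r). swap_mask permutes each y vector so its real and
+   imaginary halves are exchanged, letting the same multiplies feed both
+   sets of sums; the CONJ and non-CONJ results are then combined from these
+   four sums in the scalar code below. */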
+ + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix=0, iy=0; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -8; + BLASLONG j=0; + + if (n1){ + cdot_kernel_8(n1, x, y, dot); + i = n1; + j = n1 <<1; + } + + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} diff --git a/kernel/power/crot.c b/kernel/power/crot.c new file mode 100644 index 000000000..7e04a09e8 --- /dev/null +++ b/kernel/power/crot.c @@ -0,0 +1,213 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. %[temp_n], %[temp_n], -16 \n\t" + "ble 2f \n\t" + ".p2align 5 \n\t" + "1: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -16 \n\t" + "bgt 1b \n\t" + "2: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT temp; + if ( n <= 0 ) return(0); + if ( (inc_x == 1) && (inc_y == 1) ) + { + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x1, y1, c, s); + i=n1; + } + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + i++ ; + } + + } + else + { + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + return(0); +} + diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c new file mode 100644 index 000000000..aa0531dc6 --- /dev/null +++ b/kernel/power/icamax.c @@ -0,0 +1,261 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+#include <altivec.h>
+#if defined(DOUBLE)
+    #define ABS fabs
+#else
+    #define ABS fabsf
+#endif
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+
+
+
+/**
+ * Find maximum index
+ * Warning: requirements n>0 and n % 32 == 0
+ * @param n
+ * @param x pointer to the vector
+ * @param maxf (out) maximum absolute value (only for output)
+ * @return index
+ */
+static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
+
+    BLASLONG index;
+    BLASLONG i;
+    register __vector unsigned int static_index0 = {0,1,2,3};
+    register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
+    register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
+    register __vector unsigned int static_index1=static_index0 +temp0; //{4,5,6,7};
+    register __vector unsigned int static_index2=static_index0 +temp1; //{8,9,10,11};
+    register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
+    temp0=vec_xor(temp0,temp0);
+    temp1=temp1 <<1 ; //{16,16,16,16}
+    register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}
+    register __vector unsigned int quadruple_indices=temp0; //{0,0,0,0}
+    register __vector float quadruple_values={0,0,0,0};
+
+    register __vector float * v_ptrx=(__vector float *)x;
+    register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
+    register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+    for(; i31
+
+        //find final quadruple from 32 elements
+        r2=vec_cmpgt(vv0,vf0);
+        ind2 = vec_sel( indf0,indv0,r2);
+        vv0= vec_sel(vf0,vv0,r2);
+        //get absolute index
+        ind2+=temp0;
+        //compare with old quadruple and update
+        r1=vec_cmpgt(vv0,quadruple_values);
+        quadruple_indices = vec_sel( quadruple_indices,ind2,r1);
+        quadruple_values= vec_sel(quadruple_values,vv0,r1);
+
+        temp0+=temp_add;
+    }
+
+    //now we have to choose from 4 values and 4 different indices
+    //we compare pairwise; if a pair is exactly equal we choose the smaller index,
+    //otherwise we take the index of the maximum value
+    float a1,a2,a3,a4;
+    unsigned int i1,i2,i3,i4;
+    a1=vec_extract(quadruple_values,0);
+    a2=vec_extract(quadruple_values,1);
+    a3=vec_extract(quadruple_values,2);
+    a4=vec_extract(quadruple_values,3);
+    i1=vec_extract(quadruple_indices,0);
+    i2=vec_extract(quadruple_indices,1);
+    i3=vec_extract(quadruple_indices,2);
+    i4=vec_extract(quadruple_indices,3);
+    if(a1==a2){
+        index=i1>i2?i2:i1;
+    }else if(a2>a1){
+        index=i2;
+        a1=a2;
+    }else{
+        index= i1;
+    }
+
+    if(a4==a3){
+        i1=i3>i4?i4:i3;
+    }else if(a4>a3){
+        i1=i4;
+        a3=a4;
+    }else{
+        i1= i3;
+    }
+
+    if(a1==a3){
+        index=i1>index?index:i1;
+        *maxf=a1;
+    }else
if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c new file mode 100644 index 000000000..36432c993 --- /dev/null +++ b/kernel/power/icamin.c @@ -0,0 +1,266 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+#include <altivec.h>
+#if defined(DOUBLE)
+    #define ABS fabs
+#else
+    #define ABS fabsf
+#endif
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+
+
+
+/**
+ * Find minimum index
+ * Warning: requirements n>0 and n % 32 == 0
+ * @param n
+ * @param x pointer to the vector
+ * @param minf (out) minimum absolute value (only for output)
+ * @return index
+ */
+static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
+
+    BLASLONG index;
+    BLASLONG i;
+    register __vector unsigned int static_index0 = {0,1,2,3};
+    register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
+    register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
+    register __vector unsigned int static_index1=static_index0 +temp0; //{4,5,6,7};
+    register __vector unsigned int static_index2=static_index0 +temp1; //{8,9,10,11};
+    register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
+    temp0=vec_xor(temp0,temp0);
+    temp1=temp1 <<1 ; //{16,16,16,16}
+    register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}
+    register __vector unsigned int quadruple_indices=temp0; //{0,0,0,0}
+    float first_min=CABS1(x,0);
+    register __vector float quadruple_values={first_min,first_min,first_min,first_min};
+
+    register __vector float * v_ptrx=(__vector float *)x;
+    register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
+    register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+    for(; i31
+
+        //find final quadruple from 32 elements
+        r2=vec_cmpgt(vf0,vv0);
+        ind2 = vec_sel( indf0,indv0,r2);
+        vv0= vec_sel(vf0,vv0,r2);
+        //get absolute index
+        ind2+=temp0;
+        //compare with old quadruple and update
+        r1=vec_cmpgt(quadruple_values,vv0);
+        quadruple_indices = vec_sel( quadruple_indices,ind2,r1);
+        quadruple_values= vec_sel(quadruple_values,vv0,r1);
+
+        temp0+=temp_add;
+    }
+
+    //now we have to choose from 4 values and 4 different indices
+    //we compare pairwise; if a pair is exactly equal we choose the smaller index,
+    //otherwise we take the index of the minimum value
+    //(the eaten comparison branches below are reconstructed from the mirrored icamax code)
+    float a1,a2,a3,a4;
+    unsigned int i1,i2,i3,i4;
+    a1=vec_extract(quadruple_values,0);
+    a2=vec_extract(quadruple_values,1);
+    a3=vec_extract(quadruple_values,2);
+    a4=vec_extract(quadruple_values,3);
+    i1=vec_extract(quadruple_indices,0);
+    i2=vec_extract(quadruple_indices,1);
+    i3=vec_extract(quadruple_indices,2);
+    i4=vec_extract(quadruple_indices,3);
+    if(a1==a2){
+        index=i1>i2?i2:i1;
+    }else if(a2<a1){
+        index=i2;
+        a1=a2;
+    }else{
+        index= i1;
+    }
+
+    if(a4==a3){
+        i1=i3>i4?i4:i3;
+    }else if(a4<a3){
+        i1=i4;
+        a3=a4;
+    }else{
+        i1= i3;
+    }
+
+    if(a1==a3){
+        index=i1>index?index:i1;
+        *minf=a1;
+    }else if(a3<a1){
+        index=i1;
+        *minf=a3;
+    }else{
+        *minf=a1;
+    }
+    return index;
+
+}
+
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0;
+    FLOAT minf;
+    BLASLONG min = 0;
+    BLASLONG inc_x2;
+
+    if (n <= 0 || inc_x <= 0) return(min);
+
+    minf = CABS1(x,0);
+
+    if (inc_x == 1) {
+
+        BLASLONG n1 = n & -32;
+        if (n1 > 0) {
+
+            min = ciamin_kernel_32(n1, x, &minf);
+            i = n1;
+            ix = n1 << 1;
+        }
+
+
+        while(i < n)
+        {
+            if( CABS1(x,ix) < minf )
+            {
+                min = i;
+                minf = CABS1(x,ix);
+            }
+            ix += 2;
+            i++;
+        }
+        return (min + 1);
+
+    } else {
+
+        inc_x2 = 2 * inc_x;
+
+        minf = CABS1(x,0);
+        ix += inc_x2;
+        i++;
+
+        while(i < n)
+        {
+            if( CABS1(x,ix) < minf )
+            {
+                min = i;
+                minf = CABS1(x,ix);
+            }
+            ix += inc_x2;
+            i++;
+        }
+        return (min + 1);
+    }
+
+}
+
+
diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c
index f4d1d1bdb..7fe0f8a33 100644
--- a/kernel/power/idamin.c
+++ b/kernel/power/idamin.c
@@ -89,10 +89,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
         ".p2align 5 \n\t"
         "1: \n\t"
 
-        "xvcmpgedp 2,44,45 \n\t "
+
"xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -103,8 +103,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -125,7 +125,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -139,7 +139,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -162,10 +162,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //<-----------jump here from first load "2: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -176,8 +176,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" "xxsel 0 ,0,1,2 \n\t" "xxsel 34,34,35,3 \n\t" @@ -194,7 +194,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -210,7 +210,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -238,10 +238,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //============================================================================== - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -252,8 +252,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" @@ -264,14 +264,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" "vaddudm 1,1,5 \n\t" // get real index for first smaller //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" 
"xxsel 38,38,33,2 \n\t" @@ -284,7 +284,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c new file mode 100644 index 000000000..bf1af78d6 --- /dev/null +++ b/kernel/power/isamax.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c new file mode 100644 index 000000000..1c1f0ad78 --- /dev/null +++ b/kernel/power/isamin.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 448247ffd..1ffa3ba8b 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -101,8 +101,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -114,7 +114,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -126,7 +126,7 @@ static BLASLONG 
ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -166,8 +166,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -179,7 +179,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -191,7 +191,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -235,15 +235,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "xxsel 32,40,41,50 \n\t" "xxsel 0,46,47,50 \n\t" "xxsel 33,42,43,51 \n\t" "xxsel 1,48,49,51 \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "xxsel 32,32,33,2 \n\t" "xxsel 3,0,1,2 \n\t" @@ -252,7 +252,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -267,7 +267,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c new file mode 100644 index 000000000..393cdfadc --- /dev/null +++ b/kernel/power/saxpy.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+
+
+#ifndef HAVE_KERNEL_8
+#include <altivec.h>
+
+static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
+{
+    BLASLONG i = 0;
+    __vector float v_a = {alpha,alpha,alpha,alpha};
+    __vector float * v_y=(__vector float *)y;
+    __vector float * v_x=(__vector float *)x;
+
+    for(; i
Date: Thu, 17 Jan 2019 14:45:31 +0000
Subject: [PATCH 396/935] crot fix

---
 kernel/power/crot.c | 90 +++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 36 deletions(-)

diff --git a/kernel/power/crot.c b/kernel/power/crot.c
index 7e04a09e8..40e350ba3 100644
--- a/kernel/power/crot.c
+++ b/kernel/power/crot.c
@@ -55,7 +55,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
         "lxvd2x 51, %[i48], %[y_ptr] \n\t"
         "addi %[x_ptr], %[x_ptr], 64 \n\t"
         "addi %[y_ptr], %[y_ptr], 64 \n\t"
-        "addic. %[temp_n], %[temp_n], -16 \n\t"
+        "addic. %[temp_n], %[temp_n], -8 \n\t"
         "ble 2f \n\t"
         ".p2align 5 \n\t"
         "1: \n\t"
@@ -103,7 +103,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
         "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"
         "addi %[x_ptr], %[x_ptr], 128 \n\t"
         "addi %[y_ptr], %[y_ptr], 128 \n\t"
-        "addic. %[temp_n], %[temp_n], -16 \n\t"
+        "addic. %[temp_n], %[temp_n], -8 \n\t"
         "bgt 1b \n\t"
         "2: \n\t"
         "xvmulsp 40, 32, 36 \n\t" // c * x
@@ -173,41 +173,59 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
-    BLASLONG i=0;
-    BLASLONG ix=0,iy=0;
-    FLOAT *x1=x;
-    FLOAT *y1=y;
-    FLOAT temp;
-    if ( n <= 0 ) return(0);
-    if ( (inc_x == 1) && (inc_y == 1) )
-    {
-        BLASLONG n1 = n & -8;
-        if ( n1 > 0 )
-        {
-            crot_kernel_8(n1, x1, y1, c, s);
-            i=n1;
-        }
-        while(i < n)
-        {
-            temp = c*x[i] + s*y[i] ;
-            y[i] = c*y[i] - s*x[i] ;
-            x[i] = temp ;
-            i++ ;
-        }
+    BLASLONG i=0;
+    BLASLONG ix=0,iy=0;
+    FLOAT temp[2];
+    BLASLONG inc_x2;
+    BLASLONG inc_y2;
-    }
-    else
-    {
-        while(i < n)
-        {
-            temp = c*x[ix] + s*y[iy] ;
-            y[iy] = c*y[iy] - s*x[ix] ;
-            x[ix] = temp ;
-            ix += inc_x ;
-            iy += inc_y ;
-            i++ ;
-        }
-    }
+    if ( n <= 0 ) return(0);
+
+    if ( (inc_x == 1) && (inc_y == 1) )
+    {
+
+        BLASLONG n1 = n & -8;
+        if ( n1 > 0 )
+        {
+            crot_kernel_8(n1, x, y, c, s);
+            i=n1;
+            ix=2*n1;
+        }
+
+        while(i < n)
+        {
+            temp[0]  = c*x[ix]   + s*y[ix] ;
+            temp[1]  = c*x[ix+1] + s*y[ix+1] ;
+            y[ix]    = c*y[ix]   - s*x[ix] ;
+            y[ix+1]  = c*y[ix+1] - s*x[ix+1] ;
+            x[ix]    = temp[0] ;
+            x[ix+1]  = temp[1] ;
+
+            ix += 2 ;
+            i++ ;
+
+        }
+
+    }
+    else
+    {
+        inc_x2 = 2 * inc_x ;
+        inc_y2 = 2 * inc_y ;
+        while(i < n)
+        {
+            temp[0]  = c*x[ix]   + s*y[iy] ;
+            temp[1]  = c*x[ix+1] + s*y[iy+1] ;
+            y[iy]    = c*y[iy]   - s*x[ix] ;
+            y[iy+1]  = c*y[iy+1] - s*x[ix+1] ;
+            x[ix]    = temp[0] ;
+            x[ix+1]  = temp[1] ;
+
+            ix += inc_x2 ;
+            iy += inc_y2 ;
+            i++ ;
+
+        }
+    }
     return(0);
 }

From 3e9fd6359dabb1c9c8ce3fa5e980e94a3536d2c0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 17 Jan 2019 16:19:03 +0100
Subject: [PATCH 397/935] Bump xcode version to 10.1 to make sure it handles
 AVX512

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 51679af62..ec5dc8a9b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -149,7 +149,7 @@ matrix:
 
     - &test-macos
       os: osx
-      osx_image: xcode8.3
+      osx_image: xcode10.1
       before_script:
         - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
         - brew update

From d5e6940253b2ee638509de283b8b1d7695fefbbf Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 17 Jan 2019 23:20:32 +0100
Subject: [PATCH 398/935] Fix declaration of input arguments in the x86_64
 microkernels for DOT and AXPY (#1965)

* Tag operands 0 and 1 as both input and output

For #1964 (basically a continuation of coding problems first seen in #1292)
---
 kernel/x86_64/caxpy_microk_bulldozer-2.c   | 14 +++++++-------
 kernel/x86_64/caxpy_microk_haswell-2.c     |  6 +++---
 kernel/x86_64/caxpy_microk_sandy-2.c       |  8 ++++----
 kernel/x86_64/caxpy_microk_steamroller-2.c | 14 +++++++-------
 kernel/x86_64/cdot_microk_bulldozer-2.c    | 14 +++++++-------
 kernel/x86_64/cdot_microk_haswell-2.c      |  6 +++---
 kernel/x86_64/cdot_microk_sandy-2.c        |  8 ++++----
 kernel/x86_64/cdot_microk_steamroller-2.c  | 14 +++++++-------
 kernel/x86_64/daxpy_microk_bulldozer-2.c   |  6 +++---
 kernel/x86_64/daxpy_microk_haswell-2.c     |  8 ++++----
 kernel/x86_64/daxpy_microk_nehalem-2.c     |  6 +++---
 kernel/x86_64/daxpy_microk_piledriver-2.c  | 16 ++++++++--------
 kernel/x86_64/daxpy_microk_sandy-2.c       |  8 ++++----
 kernel/x86_64/daxpy_microk_steamroller-2.c | 16 ++++++++--------
 kernel/x86_64/ddot_microk_bulldozer-2.c    |  8 ++++----
 kernel/x86_64/ddot_microk_haswell-2.c      |  6 +++---
 kernel/x86_64/ddot_microk_nehalem-2.c      |  8 ++++----
 kernel/x86_64/ddot_microk_piledriver-2.c   | 16 ++++++++--------
kernel/x86_64/ddot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/ddot_microk_steamroller-2.c | 8 ++++---- kernel/x86_64/saxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/saxpy_microk_nehalem-2.c | 6 +++--- kernel/x86_64/saxpy_microk_piledriver-2.c | 16 ++++++++-------- kernel/x86_64/saxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/sdot_microk_bulldozer-2.c | 8 ++++---- kernel/x86_64/sdot_microk_haswell-2.c | 8 ++++---- kernel/x86_64/sdot_microk_nehalem-2.c | 8 ++++---- kernel/x86_64/sdot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/sdot_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_bulldozer-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/zaxpy_microk_sandy-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_bulldozer-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_haswell-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_sandy-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_steamroller-2.c | 16 ++++++++-------- 37 files changed, 202 insertions(+), 202 deletions(-) diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 33bda0943..ca2209340 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c index 00e2e6a42..b605ea34c 100644 --- a/kernel/x86_64/caxpy_microk_haswell-2.c +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c index a798fd977..72d37afed 100644 --- a/kernel/x86_64/caxpy_microk_sandy-2.c +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c index 87370b032..7ca7af070 100644 --- a/kernel/x86_64/caxpy_microk_steamroller-2.c +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c 
b/kernel/x86_64/cdot_microk_bulldozer-2.c index f587aa036..118655913 100644 --- a/kernel/x86_64/cdot_microk_bulldozer-2.c +++ b/kernel/x86_64/cdot_microk_bulldozer-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c index fe195a63b..8b9d6d104 100644 --- a/kernel/x86_64/cdot_microk_haswell-2.c +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c index 01816917d..fe142c38f 100644 --- a/kernel/x86_64/cdot_microk_sandy-2.c +++ b/kernel/x86_64/cdot_microk_sandy-2.c @@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c index 76a3aa0eb..7350b21c9 100644 --- a/kernel/x86_64/cdot_microk_steamroller-2.c +++ b/kernel/x86_64/cdot_microk_steamroller-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c index 8c520dcf1..9c1305b97 100644 --- a/kernel/x86_64/daxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c index bbe8b9550..f3682e6d7 100644 --- a/kernel/x86_64/daxpy_microk_haswell-2.c +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c index 943d893af..8feb9f26c 100644 --- a/kernel/x86_64/daxpy_microk_nehalem-2.c +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 
0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c index 95eb953b4..4b83124c7 100644 --- a/kernel/x86_64/daxpy_microk_piledriver-2.c +++ b/kernel/x86_64/daxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c index 85e038cef..db9a45de8 100644 --- a/kernel/x86_64/daxpy_microk_sandy-2.c +++ b/kernel/x86_64/daxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c index e40009037..8e63fcc1d 100644 --- a/kernel/x86_64/daxpy_microk_steamroller-2.c +++ b/kernel/x86_64/daxpy_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c index 9756ee46a..5590c5b17 100644 --- a/kernel/x86_64/ddot_microk_bulldozer-2.c +++ b/kernel/x86_64/ddot_microk_bulldozer-2.c @@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index 365737363..dbb5487f7 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c index fb5ec9bca..e5e234e22 100644 --- a/kernel/x86_64/ddot_microk_nehalem-2.c +++ b/kernel/x86_64/ddot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c index ac950885c..cc4bcd90a 100644 --- 
a/kernel/x86_64/ddot_microk_piledriver-2.c +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c index 160f95604..84493ec27 100644 --- a/kernel/x86_64/ddot_microk_sandy-2.c +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c index 5ce20b5de..27d5244ce 100644 --- a/kernel/x86_64/ddot_microk_steamroller-2.c +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 3a743d64c..7099ba4c6 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c index 68f68ea3a..88bbb695d 100644 --- a/kernel/x86_64/saxpy_microk_nehalem-2.c +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c index 204cf8bac..5feea7f24 100644 --- a/kernel/x86_64/saxpy_microk_piledriver-2.c +++ b/kernel/x86_64/saxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c index 0a6bef046..0d448d5f8 100644 --- a/kernel/x86_64/saxpy_microk_sandy-2.c +++ b/kernel/x86_64/saxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" 
(i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c index 36e61b077..8958a33dc 100644 --- a/kernel/x86_64/sdot_microk_bulldozer-2.c +++ b/kernel/x86_64/sdot_microk_bulldozer-2.c @@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index df367b61f..91dc928d3 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c index 1a27177f5..5a715d008 100644 --- a/kernel/x86_64/sdot_microk_nehalem-2.c +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c index ca13536f2..ae25d5a50 100644 --- a/kernel/x86_64/sdot_microk_sandy-2.c +++ b/kernel/x86_64/sdot_microk_sandy-2.c @@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c index 6b8b2566b..bf6a5f287 100644 --- a/kernel/x86_64/sdot_microk_steamroller-2.c +++ b/kernel/x86_64/sdot_microk_steamroller-2.c @@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 0e15761f7..15d367971 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c index 30e8b1955..89d23daf3 100644 --- 
a/kernel/x86_64/zaxpy_microk_haswell-2.c +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c index 233af143a..17b8b24f7 100644 --- a/kernel/x86_64/zaxpy_microk_sandy-2.c +++ b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c index 728d09213..907b1ae00 100644 --- a/kernel/x86_64/zaxpy_microk_steamroller-2.c +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c index 30a9552d6..db9a48cce 100644 --- a/kernel/x86_64/zdot_microk_bulldozer-2.c +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 11056a3c1..9f2fc2c1d 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c index 87c5b0340..33415e26e 100644 --- a/kernel/x86_64/zdot_microk_sandy-2.c +++ b/kernel/x86_64/zdot_microk_sandy-2.c @@ -107,10 +107,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 
16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -199,10 +199,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c index 325f74ae3..87138fe9a 100644 --- a/kernel/x86_64/zdot_microk_steamroller-2.c +++ b/kernel/x86_64/zdot_microk_steamroller-2.c @@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 From b495e54310a99049c50c20425269f4b026b47dbb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Jan 2019 08:11:07 +0100 Subject: [PATCH 399/935] Fix declaration of input arguments in the x86_64 SCAL microkernels (#1966) * Tag arguments 0 and 1 as both input and output (see #1964) --- kernel/x86_64/cscal_microk_bulldozer-2.c | 32 +++++++++++----------- kernel/x86_64/cscal_microk_haswell-2.c | 30 ++++++++++---------- kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++----------- kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++---- kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++---- kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++---- kernel/x86_64/zscal_microk_bulldozer-2.c | 28 +++++++++---------- kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++----------- kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++----------- 9 files changed, 111 insertions(+), 111 deletions(-) diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c index 3abffc4cf..31451aa6c 100644 --- a/kernel/x86_64/cscal_microk_bulldozer-2.c +++ b/kernel/x86_64/cscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", 
"%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c index 0a4eb683c..a04a4c4ab 100644 --- a/kernel/x86_64/cscal_microk_haswell-2.c +++ b/kernel/x86_64/cscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", // "0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" - : - : - "r" (n), // 0 - "r" (x), // 1 + : + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c index 8346e1748..e8073d485 100644 --- a/kernel/x86_64/cscal_microk_steamroller-2.c +++ b/kernel/x86_64/cscal_microk_steamroller-2.c @@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c index de53b0bc4..096662781 100644 --- a/kernel/x86_64/dscal_microk_bulldozer-2.c +++ b/kernel/x86_64/dscal_microk_bulldozer-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" 
(n2) // 3 : "cc", @@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c index e732a2718..77ed59a4e 100644 --- a/kernel/x86_64/dscal_microk_haswell-2.c +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c index 8d855072b..9982b8e58 100644 --- a/kernel/x86_64/dscal_microk_sandy-2.c +++ b/kernel/x86_64/dscal_microk_sandy-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c index 03882d6b6..5e733ffda 100644 --- a/kernel/x86_64/zscal_microk_bulldozer-2.c +++ b/kernel/x86_64/zscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c index d9253c1ed..8c8f5b75c 100644 --- a/kernel/x86_64/zscal_microk_haswell-2.c +++ b/kernel/x86_64/zscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ 
-208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c index 97b07add6..c9267ee0c 100644 --- a/kernel/x86_64/zscal_microk_steamroller-2.c +++ b/kernel/x86_64/zscal_microk_steamroller-2.c @@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", From 32b0f1168ec5eb93e146245d732c5a2fa9d73282 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Jan 2019 08:11:39 +0100 Subject: [PATCH 400/935] Fix declaration of input arguments in the Sandybridge GER microkernels (#1967) * Tag arguments 0 and 1 as both input and output --- kernel/x86_64/dger_microk_sandy-2.c | 6 +++--- kernel/x86_64/sger_microk_sandy-2.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c index 2bf966a5f..e8494500f 100644 --- a/kernel/x86_64/dger_microk_sandy-2.c +++ b/kernel/x86_64/dger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git 
a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c index 79180b991..14f13475b 100644 --- a/kernel/x86_64/sger_microk_sandy-2.c +++ b/kernel/x86_64/sger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 From cda81cfae0e3dc18b1c2e9d05d6e0f8e1bec3917 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Jan 2019 00:10:01 +0100 Subject: [PATCH 401/935] Shift transition to multithreading towards larger matrix sizes See #1886 and JuliaRobotics issue 500. trsm benchmarks on Haswell and Zen showed that with these values performance is roughly doubled for matrix sizes between 8x8 and 14x14, and still 10 to 20 percent better near the new cutoff at 32x32. --- interface/trsm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/interface/trsm.c b/interface/trsm.c index 5c2750e79..faec03ac2 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -81,6 +81,12 @@ #endif #endif +#ifndef COMPLEX +#define SMP_FACTOR 8 +#else +#define SMP_FACTOR 4 +#endif + static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef TRMM TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, @@ -366,10 +372,10 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.m < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else - if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else args.nthreads = num_cpu_avail(3); From bbfdd6c0fe1e7d90099fe14f1e1f2fd775a47a36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Jan 2019 23:01:31 +0100 Subject: [PATCH 402/935] Increase Zen SWITCH_RATIO to 16 following GEMM benchmarks on Ryzen2700X. For #1464 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index fa6730208..15ea663a8 100644 --- a/param.h +++ b/param.h @@ -605,7 +605,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
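A note on the operand-constraint fixes in the DOT/AXPY, SCAL and GER patches above
(#1965, #1966, #1967): in GCC extended asm, an operand listed in the input section is
promised to be left unchanged by the asm body, yet these kernels decrement the length
counter and advance the index register inside the block, so a sufficiently aggressive
compiler could legally reuse the stale values afterwards. Declaring such registers as
read-write outputs with "+r" removes that assumption. The following is a minimal sketch
of the corrected idiom for x86-64 GCC; sum_kernel is an illustrative name and the code
is not taken from the OpenBLAS tree:

    /* constraint_demo.c -- build with: gcc -O2 constraint_demo.c */
    #include <stdio.h>

    static double sum_kernel(long n, const double *x)
    {
        double s = 0.0;
        /* n is assumed positive and even; s, x and n are all rewritten
           by the asm, so each is declared "+" (read-write output). */
        __asm__ __volatile__ (
            "1:                      \n\t"
            "addsd   (%[x]), %[s]    \n\t"
            "addsd  8(%[x]), %[s]    \n\t"
            "addq   $16, %[x]        \n\t"
            "subq   $2,  %[n]        \n\t"
            "jnz    1b               \n\t"
            : [s] "+x" (s), [x] "+r" (x), [n] "+r" (n)
            :                  /* nothing is a pure input here */
            : "cc", "memory");
        return s;
    }

    int main(void)
    {
        double v[4] = { 1.0, 2.0, 3.0, 4.0 };
        printf("%f\n", sum_kernel(4, v));   /* prints 10.000000 */
        return 0;
    }

Had x and n been listed as plain "r" inputs, as in the pre-patch kernels, the optimizer
would be free to assume both still hold their call-time values after the block, which is
exactly the class of breakage reported in #1964.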
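The trsm patch above and the SWITCH_RATIO change just below both steer small problems
away from threading. For trsm the single-thread cutoff per dimension becomes
SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD; assuming the common build default of 4 for that
threshold (an assumption, since the value is configurable at build time), real trsm now
stays single-threaded below 32x32 and complex below 16x16. A standalone sketch of the
decision, using illustrative names rather than the library's internal API:

    #include <stdio.h>

    #define GEMM_MULTITHREAD_THRESHOLD 4   /* assumed build default */

    static int trsm_nthreads(int m, int n, int is_complex, int cpus)
    {
        int smp_factor = is_complex ? 4 : 8;   /* SMP_FACTOR from the patch */
        int cutoff = smp_factor * GEMM_MULTITHREAD_THRESHOLD;
        if (m < cutoff || n < cutoff)
            return 1;                          /* threading overhead dominates */
        return cpus;
    }

    int main(void)
    {
        printf("%d\n", trsm_nthreads(14, 14, 0, 8));  /* 1; was 8 before the patch */
        printf("%d\n", trsm_nthreads(64, 64, 0, 8));  /* 8 */
        return 0;
    }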
#define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 16 #ifdef ARCH_X86 From 83b5c6b92dc6f66becae1418beef60042eb92c6d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Jan 2019 12:18:53 +0100 Subject: [PATCH 403/935] Fix compilation with NO_AVX=1 set fixes #1974 --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 726014033..c45ddd968 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -228,7 +228,7 @@ int support_avx2(){ } int support_avx512(){ -#ifndef NO_AVX512 +#if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; From b111829226874550c524b36882ff84c90008f494 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 21 Jan 2019 15:56:04 +0200 Subject: [PATCH 404/935] [ZARCH] Update max/min functions --- kernel/zarch/camax.c | 162 +++++++++++++++++-------------------- kernel/zarch/camin.c | 180 +++++++++++++++++++----------------------- kernel/zarch/damax.c | 108 ++++++++----------------- kernel/zarch/damin.c | 110 ++++++++------------------ kernel/zarch/dmax.c | 89 ++++++++------------- kernel/zarch/dmin.c | 89 ++++++++------------- kernel/zarch/icamax.c | 33 ++++---- kernel/zarch/icamin.c | 31 ++++---- kernel/zarch/idamax.c | 51 ++++++------ kernel/zarch/idamin.c | 51 ++++++------ kernel/zarch/idmax.c | 51 ++++++------ kernel/zarch/idmin.c | 51 ++++++------ kernel/zarch/isamax.c | 55 +++++++------ kernel/zarch/isamin.c | 55 +++++++------ kernel/zarch/ismax.c | 55 +++++++------ kernel/zarch/ismin.c | 55 +++++++------ kernel/zarch/izamax.c | 27 ++++--- kernel/zarch/izamin.c | 27 ++++--- kernel/zarch/samax.c | 111 ++++++++------------------ kernel/zarch/samin.c | 111 ++++++++------------------ kernel/zarch/smax.c | 92 ++++++++------------- kernel/zarch/smin.c | 92 ++++++++------------- kernel/zarch/zamax.c | 118 +++++++++++++-------------- kernel/zarch/zamin.c | 118 +++++++++++++-------------- 24 files changed, 805 insertions(+), 1117 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 2c913b62e..66d250896 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -55,7 +55,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" - "pfd 1, 1024(%2) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" "vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v17,4(%%r1,%2),0 \n\t" @@ -93,100 +93,88 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v22,120(%%r1,%2),3 \n\t" "vlef %%v23,124(%%r1,%2),3 \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchsb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchsb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vlef %%v16,128(%%r1,%2),0 \n\t" - "vlef %%v17,132(%%r1,%2),0 \n\t" - "vlef %%v16,136(%%r1,%2),1 \n\t" - "vlef %%v17,140(%%r1,%2),1 \n\t" - "vlef %%v16,144(%%r1,%2),2 \n\t" - "vlef %%v17,148(%%r1,%2),2 \n\t" - "vlef %%v16,152(%%r1,%2),3 \n\t" - "vlef %%v17,156(%%r1,%2),3 \n\t" - - "vlef %%v18,160(%%r1,%2),0 \n\t" - "vlef %%v19,164(%%r1,%2),0 \n\t" - "vlef %%v18,168(%%r1,%2),1 \n\t" - "vlef 
%%v19,172(%%r1,%2),1 \n\t" - "vlef %%v18,176(%%r1,%2),2 \n\t" - "vlef %%v19,180(%%r1,%2),2 \n\t" - "vlef %%v18,184(%%r1,%2),3 \n\t" - "vlef %%v19,188(%%r1,%2),3 \n\t" - - "vlef %%v20,192(%%r1,%2),0 \n\t" - "vlef %%v21,196(%%r1,%2),0 \n\t" - "vlef %%v20,200(%%r1,%2),1 \n\t" - "vlef %%v21,204(%%r1,%2),1 \n\t" - "vlef %%v20,208(%%r1,%2),2 \n\t" - "vlef %%v21,212(%%r1,%2),2 \n\t" - "vlef %%v20,216(%%r1,%2),3 \n\t" - "vlef %%v21,220(%%r1,%2),3 \n\t" - - "vlef %%v22,224(%%r1,%2),0 \n\t" - "vlef %%v23,228(%%r1,%2),0 \n\t" - "vlef %%v22,232(%%r1,%2),1 \n\t" - "vlef %%v23,236(%%r1,%2),1 \n\t" - "vlef %%v22,240(%%r1,%2),2 \n\t" - "vlef %%v23,244(%%r1,%2),2 \n\t" - "vlef %%v22,248(%%r1,%2),3 \n\t" - "vlef %%v23,252(%%r1,%2),3 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vlef %%v24,128(%%r1,%2),0 \n\t" + "vlef %%v25,132(%%r1,%2),0 \n\t" + "vlef %%v24,136(%%r1,%2),1 \n\t" + "vlef %%v25,140(%%r1,%2),1 \n\t" + "vlef %%v24,144(%%r1,%2),2 \n\t" + "vlef %%v25,148(%%r1,%2),2 \n\t" + "vlef %%v24,152(%%r1,%2),3 \n\t" + "vlef %%v25,156(%%r1,%2),3 \n\t" + + "vlef %%v26,160(%%r1,%2),0 \n\t" + "vlef %%v27,164(%%r1,%2),0 \n\t" + "vlef %%v26,168(%%r1,%2),1 \n\t" + "vlef %%v27,172(%%r1,%2),1 \n\t" + "vlef %%v26,176(%%r1,%2),2 \n\t" + "vlef %%v27,180(%%r1,%2),2 \n\t" + "vlef %%v26,184(%%r1,%2),3 \n\t" + "vlef %%v27,188(%%r1,%2),3 \n\t" + + "vlef %%v28,192(%%r1,%2),0 \n\t" + "vlef %%v29,196(%%r1,%2),0 \n\t" + "vlef %%v28,200(%%r1,%2),1 \n\t" + "vlef %%v29,204(%%r1,%2),1 \n\t" + "vlef %%v28,208(%%r1,%2),2 \n\t" + "vlef %%v29,212(%%r1,%2),2 \n\t" + "vlef %%v28,216(%%r1,%2),3 \n\t" + "vlef %%v29,220(%%r1,%2),3 \n\t" + + "vlef %%v30,224(%%r1,%2),0 \n\t" + "vlef %%v31,228(%%r1,%2),0 \n\t" + "vlef %%v30,232(%%r1,%2),1 \n\t" + "vlef %%v31,236(%%r1,%2),1 \n\t" + "vlef %%v30,240(%%r1,%2),2 \n\t" + "vlef %%v31,244(%%r1,%2),2 \n\t" + "vlef %%v30,248(%%r1,%2),3 \n\t" + "vlef %%v31,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16,%%v16 \n\t" + "vflpsb %%v17,%%v17 \n\t" + "vflpsb %%v18,%%v18 \n\t" + "vflpsb %%v19,%%v19 \n\t" + "vflpsb %%v20,%%v20 \n\t" + "vflpsb %%v21,%%v21 \n\t" + "vflpsb %%v22,%%v22 \n\t" + "vflpsb %%v23,%%v23 \n\t" + "vflpsb %%v24,%%v24 \n\t" + "vflpsb %%v25,%%v25 \n\t" + "vflpsb %%v26,%%v26 \n\t" + "vflpsb %%v27,%%v27 \n\t" + "vflpsb %%v28,%%v28 \n\t" + "vflpsb %%v29,%%v29 \n\t" + "vflpsb %%v30,%%v30 \n\t" + "vflpsb %%v31,%%v31 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" + "vfasb %%v18,%%v18,%%v19 \n\t" + "vfasb %%v20,%%v20,%%v21 \n\t" + "vfasb %%v22,%%v22,%%v23 \n\t" + "vfasb %%v24,%%v24,%%v25 \n\t" + "vfasb %%v26,%%v26,%%v27 \n\t" + "vfasb %%v28,%%v28,%%v29 \n\t" + "vfasb %%v30,%%v30,%%v31 \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" - "vfchsb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" - "vfchsb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 
\n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { maxf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 733f98fbf..5abc685b2 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -43,8 +43,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) __asm__ volatile ( "vlef %%v0,0(%2),0 \n\t" "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),0 \n\t" - "vlef %%v16,12(%2),0 \n\t" + "vlef %%v0,8(%2),1 \n\t" + "vlef %%v16,12(%2),1 \n\t" "vlef %%v0,16(%2),2 \n\t" "vlef %%v16,20(%2),2 \n\t" "vlef %%v0,24(%2),3 \n\t" @@ -59,8 +59,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),0 \n\t" - "vlef %%v17,12(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),1 \n\t" + "vlef %%v17,12(%%r1,%2),1 \n\t" "vlef %%v16,16(%%r1,%2),2 \n\t" "vlef %%v17,20(%%r1,%2),2 \n\t" "vlef %%v16,24(%%r1,%2),3 \n\t" @@ -68,8 +68,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v18,32(%%r1,%2),0 \n\t" "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),0 \n\t" - "vlef %%v19,44(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),1 \n\t" + "vlef %%v19,44(%%r1,%2),1 \n\t" "vlef %%v18,48(%%r1,%2),2 \n\t" "vlef %%v19,52(%%r1,%2),2 \n\t" "vlef %%v18,56(%%r1,%2),3 \n\t" @@ -77,8 +77,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v20,64(%%r1,%2),0 \n\t" "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),0 \n\t" - "vlef %%v21,76(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),1 \n\t" + "vlef %%v21,76(%%r1,%2),1 \n\t" "vlef %%v20,80(%%r1,%2),2 \n\t" "vlef %%v21,84(%%r1,%2),2 \n\t" "vlef %%v20,88(%%r1,%2),3 \n\t" @@ -86,107 +86,95 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v22,96(%%r1,%2),0 \n\t" "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),0 \n\t" - "vlef %%v23,108(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),1 \n\t" + "vlef %%v23,108(%%r1,%2),1 \n\t" "vlef %%v22,112(%%r1,%2),2 \n\t" "vlef %%v23,116(%%r1,%2),2 \n\t" "vlef %%v22,120(%%r1,%2),3 \n\t" "vlef %%v23,124(%%r1,%2),3 \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchsb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchsb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vlef %%v16,128(%%r1,%2),0 \n\t" - "vlef %%v17,132(%%r1,%2),0 \n\t" - "vlef %%v16,136(%%r1,%2),0 \n\t" - "vlef %%v17,140(%%r1,%2),0 \n\t" - "vlef %%v16,144(%%r1,%2),2 \n\t" - "vlef %%v17,148(%%r1,%2),2 \n\t" - "vlef %%v16,152(%%r1,%2),3 \n\t" - "vlef %%v17,156(%%r1,%2),3 \n\t" - - "vlef %%v18,160(%%r1,%2),0 \n\t" - "vlef 
%%v19,164(%%r1,%2),0 \n\t" - "vlef %%v18,168(%%r1,%2),0 \n\t" - "vlef %%v19,172(%%r1,%2),0 \n\t" - "vlef %%v18,176(%%r1,%2),2 \n\t" - "vlef %%v19,180(%%r1,%2),2 \n\t" - "vlef %%v18,184(%%r1,%2),3 \n\t" - "vlef %%v19,188(%%r1,%2),3 \n\t" - - "vlef %%v20,192(%%r1,%2),0 \n\t" - "vlef %%v21,196(%%r1,%2),0 \n\t" - "vlef %%v20,200(%%r1,%2),0 \n\t" - "vlef %%v21,204(%%r1,%2),0 \n\t" - "vlef %%v20,208(%%r1,%2),2 \n\t" - "vlef %%v21,212(%%r1,%2),2 \n\t" - "vlef %%v20,216(%%r1,%2),3 \n\t" - "vlef %%v21,220(%%r1,%2),3 \n\t" - - "vlef %%v22,224(%%r1,%2),0 \n\t" - "vlef %%v23,228(%%r1,%2),0 \n\t" - "vlef %%v22,232(%%r1,%2),0 \n\t" - "vlef %%v23,236(%%r1,%2),0 \n\t" - "vlef %%v22,240(%%r1,%2),2 \n\t" - "vlef %%v23,244(%%r1,%2),2 \n\t" - "vlef %%v22,248(%%r1,%2),3 \n\t" - "vlef %%v23,252(%%r1,%2),3 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vlef %%v24,128(%%r1,%2),0 \n\t" + "vlef %%v25,132(%%r1,%2),0 \n\t" + "vlef %%v24,136(%%r1,%2),1 \n\t" + "vlef %%v25,140(%%r1,%2),1 \n\t" + "vlef %%v24,144(%%r1,%2),2 \n\t" + "vlef %%v25,148(%%r1,%2),2 \n\t" + "vlef %%v24,152(%%r1,%2),3 \n\t" + "vlef %%v25,156(%%r1,%2),3 \n\t" + + "vlef %%v26,160(%%r1,%2),0 \n\t" + "vlef %%v27,164(%%r1,%2),0 \n\t" + "vlef %%v26,168(%%r1,%2),1 \n\t" + "vlef %%v27,172(%%r1,%2),1 \n\t" + "vlef %%v26,176(%%r1,%2),2 \n\t" + "vlef %%v27,180(%%r1,%2),2 \n\t" + "vlef %%v26,184(%%r1,%2),3 \n\t" + "vlef %%v27,188(%%r1,%2),3 \n\t" + + "vlef %%v28,192(%%r1,%2),0 \n\t" + "vlef %%v29,196(%%r1,%2),0 \n\t" + "vlef %%v28,200(%%r1,%2),1 \n\t" + "vlef %%v29,204(%%r1,%2),1 \n\t" + "vlef %%v28,208(%%r1,%2),2 \n\t" + "vlef %%v29,212(%%r1,%2),2 \n\t" + "vlef %%v28,216(%%r1,%2),3 \n\t" + "vlef %%v29,220(%%r1,%2),3 \n\t" + + "vlef %%v30,224(%%r1,%2),0 \n\t" + "vlef %%v31,228(%%r1,%2),0 \n\t" + "vlef %%v30,232(%%r1,%2),1 \n\t" + "vlef %%v31,236(%%r1,%2),1 \n\t" + "vlef %%v30,240(%%r1,%2),2 \n\t" + "vlef %%v31,244(%%r1,%2),2 \n\t" + "vlef %%v30,248(%%r1,%2),3 \n\t" + "vlef %%v31,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16,%%v16 \n\t" + "vflpsb %%v17,%%v17 \n\t" + "vflpsb %%v18,%%v18 \n\t" + "vflpsb %%v19,%%v19 \n\t" + "vflpsb %%v20,%%v20 \n\t" + "vflpsb %%v21,%%v21 \n\t" + "vflpsb %%v22,%%v22 \n\t" + "vflpsb %%v23,%%v23 \n\t" + "vflpsb %%v24,%%v24 \n\t" + "vflpsb %%v25,%%v25 \n\t" + "vflpsb %%v26,%%v26 \n\t" + "vflpsb %%v27,%%v27 \n\t" + "vflpsb %%v28,%%v28 \n\t" + "vflpsb %%v29,%%v29 \n\t" + "vflpsb %%v30,%%v30 \n\t" + "vflpsb %%v31,%%v31 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" + "vfasb %%v18,%%v18,%%v19 \n\t" + "vfasb %%v20,%%v20,%%v21 \n\t" + "vfasb %%v22,%%v22,%%v23 \n\t" + "vfasb %%v24,%%v24,%%v25 \n\t" + "vfasb %%v26,%%v26,%%v27 \n\t" + "vfasb %%v28,%%v28,%%v29 \n\t" + "vfasb %%v30,%%v30,%%v31 \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfminsb %%v16,%%v16,%%v24,0 \n\t" + "vfminsb %%v18,%%v18,%%v26,0 \n\t" + "vfminsb %%v20,%%v20,%%v28,0 \n\t" + "vfminsb %%v22,%%v22,%%v30,0 \n\t" + + "vfminsb %%v16,%%v16,%%v20,0 \n\t" + "vfminsb %%v18,%%v18,%%v22,0 \n\t" - "vfchsb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfminsb %%v16,%%v16,%%v18,0 \n\t" - "vfchsb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" 
"agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfminsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 236d11c72..a3d63fe53 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -39,8 +39,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) FLOAT amax; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -54,79 +53,42 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" + "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" + "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" + 
"vfmaxdb %%v23,%%v23,%%v31,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" + "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" + + "vfmaxdb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ldr %0,%%f0 " + "vrepg %%v16,%%v0,1 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" + "lpdr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -168,11 +130,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index c2c63c6c5..738ed8710 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -39,11 +39,10 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) FLOAT amin; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%2) \n\t" @@ -54,79 +53,42 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel 
%%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v16,%%v16,%%v24,8 \n\t" + "vfmindb %%v17,%%v17,%%v25,8 \n\t" + "vfmindb %%v18,%%v18,%%v26,8 \n\t" + "vfmindb %%v19,%%v19,%%v27,8 \n\t" + "vfmindb %%v20,%%v20,%%v28,8 \n\t" + "vfmindb %%v21,%%v21,%%v29,8 \n\t" + "vfmindb %%v22,%%v22,%%v30,8 \n\t" + "vfmindb %%v23,%%v23,%%v31,8 \n\t" + + "vfmindb %%v16,%%v16,%%v20,8 \n\t" + "vfmindb %%v17,%%v17,%%v21,8 \n\t" + "vfmindb %%v18,%%v18,%%v22,8 \n\t" + "vfmindb %%v19,%%v19,%%v23,8 \n\t" + + "vfmindb %%v16,%%v16,%%v18,8 \n\t" + "vfmindb %%v17,%%v17,%%v19,8 \n\t" + + "vfmindb %%v16,%%v16,%%v17,8 \n\t" + + "vfmindb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ldr %0,%%f0 " + "vrepg %%v16,%%v0,1 \n\t" + "wfmindb %%v0,%%v0,%%v16,8 \n\t" + "lpdr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -168,11 +130,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 469f65735..aa8b932f9 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -32,7 +32,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) FLOAT max; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -46,62 +46,41 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0
\n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v25,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxdb %%v19,%%v19,%%v27,0 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxdb %%v21,%%v21,%%v29,0 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" + "vfmaxdb %%v23,%%v23,%%v31,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v21,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" + "vfmaxdb %%v19,%%v19,%%v23,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v19,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v17,0 \n\t" + + "vfmaxdb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepg %%v16,%%v0,1 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(max) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -144,11 +123,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 3df504950..8ae5fe868 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -32,7 +32,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) FLOAT min; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -46,62 +46,41 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v16,%%v16,%%v24,0 \n\t" + "vfmindb %%v17,%%v17,%%v25,0 \n\t" + "vfmindb %%v18,%%v18,%%v26,0 \n\t" + "vfmindb %%v19,%%v19,%%v27,0 \n\t" + "vfmindb %%v20,%%v20,%%v28,0 \n\t" + "vfmindb %%v21,%%v21,%%v29,0 \n\t" + "vfmindb %%v22,%%v22,%%v30,0 
\n\t" + "vfmindb %%v23,%%v23,%%v31,0 \n\t" + + "vfmindb %%v16,%%v16,%%v20,0 \n\t" + "vfmindb %%v17,%%v17,%%v21,0 \n\t" + "vfmindb %%v18,%%v18,%%v22,0 \n\t" + "vfmindb %%v19,%%v19,%%v23,0 \n\t" + + "vfmindb %%v16,%%v16,%%v18,0 \n\t" + "vfmindb %%v17,%%v17,%%v19,0 \n\t" + + "vfmindb %%v16,%%v16,%%v17,0 \n\t" + + "vfmindb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepg %%v16,%%v0,1 \n\t" + "wfmindb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(min) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -144,11 +123,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 9b4077c6b..27f969eee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -76,7 +76,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" - "pfd 1, 1024(%3) \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" "vlef %%v16,0(%%r1,%3),0 \n\t" "vlef %%v17,4(%%r1,%3),0 \n\t" @@ -127,14 +127,14 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -142,13 +142,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vlef %%v16,128(%%r1,%3),0 \n\t" @@ -200,14 +200,14 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -215,13 +215,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -250,8 +250,8 @@ static BLASLONG 
icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -302,6 +302,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 6e952a325..ae7b37b4f 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -127,14 +127,14 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -142,13 +142,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vlef %%v16,128(%%r1,%3),0 \n\t" @@ -200,14 +200,14 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -215,13 +215,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -250,8 +250,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -302,6 +302,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 4f7ff6985..e5a1d3a7c 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -63,7 +63,7 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + 
"0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -83,10 +83,10 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -96,21 +96,21 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -130,10 +130,10 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -143,21 +143,21 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -175,8 +175,8 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -221,12 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 3abc7a558..a68f7282f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -63,7 +63,7 @@ static BLASLONG 
idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -83,10 +83,10 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -96,21 +96,21 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -130,10 +130,10 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -143,21 +143,21 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -175,8 +175,8 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -221,12 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 
313a88db4..4c3040779 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -55,7 +55,7 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -67,10 +67,10 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -80,21 +80,21 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -106,10 +106,10 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -119,21 +119,21 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -151,8 +151,8 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imax),"=m"(*max) @@ -197,12 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j 
< n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 42443215b..ba1776a49 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -55,7 +55,7 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -67,10 +67,10 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -80,21 +80,21 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -106,10 +106,10 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -119,21 +119,21 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -151,8 +151,8 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imin),"=m"(*min) @@ -197,12 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; - i += 
inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index dd2144db2..2f5c1c867 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -81,7 +81,7 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -101,10 +101,10 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -114,14 +114,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -129,13 +129,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -155,10 +155,10 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -168,14 +168,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -183,13 +183,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel 
%%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -218,8 +218,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -264,12 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index d7e44421d..04e05aad9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -81,7 +81,7 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -101,10 +101,10 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -114,14 +114,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -129,13 +129,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -155,10 +155,10 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -168,14 +168,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - 
"vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -183,13 +183,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -218,8 +218,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -264,12 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 1ebc6c8c8..084b4ce94 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -73,7 +73,7 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -85,10 +85,10 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -98,14 +98,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -113,13 +113,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ 
-131,10 +131,10 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -144,14 +144,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -159,13 +159,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -194,8 +194,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imax),"=m"(*max) @@ -240,12 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index a6b9d59de..4e85816a3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -73,7 +73,7 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -85,10 +85,10 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -98,14 +98,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb 
%%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -113,13 +113,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -131,10 +131,10 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -144,14 +144,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -159,13 +159,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -194,8 +194,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imin),"=m"(*min) @@ -240,12 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 541464b05..2ffad2570 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -93,21 +93,21 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" 
"vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vleg %%v16,128(%%r1,%3),0 \n\t" @@ -139,21 +139,21 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -171,8 +171,8 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -223,6 +223,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 4b5572b80..1e037c0c7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -93,21 +93,21 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vleg %%v16,128(%%r1,%3),0 \n\t" @@ -139,21 +139,21 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -171,8 +171,8 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" 
"vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -223,6 +223,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index 61d50159f..c8d831d06 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -40,8 +40,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -54,83 +53,45 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" + "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" + "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" + "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" + "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" + + "vfmaxsb %%v0,%%v0,%%16,8 \n\t" 
"agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ler %0,%%f0 " + "vrepf %%v16,%%v0,2 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" + "lper %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -172,11 +133,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index a585a79ff..dd24c74d7 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -40,8 +40,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -54,83 +53,45 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v16,%%v16,%%v24,8 \n\t" + "vfminsb %%v17,%%v17,%%v25,8 \n\t" + "vfminsb 
%%v18,%%v18,%%v26,8     \n\t"
+        "vfminsb %%v19,%%v19,%%v27,8     \n\t"
+        "vfminsb %%v20,%%v20,%%v28,8     \n\t"
+        "vfminsb %%v21,%%v21,%%v29,8     \n\t"
+        "vfminsb %%v22,%%v22,%%v30,8     \n\t"
+        "vfminsb %%v23,%%v23,%%v31,8     \n\t"
+
+        "vfminsb %%v16,%%v16,%%v20,8     \n\t"
+        "vfminsb %%v17,%%v17,%%v21,8     \n\t"
+        "vfminsb %%v18,%%v18,%%v22,8     \n\t"
+        "vfminsb %%v19,%%v19,%%v23,8     \n\t"
+
+        "vfminsb %%v16,%%v16,%%v18,8     \n\t"
+        "vfminsb %%v17,%%v17,%%v19,8     \n\t"
+
+        "vfminsb %%v16,%%v16,%%v17,8     \n\t"
+
+        "vfminsb %%v0,%%v0,%%v16,8       \n\t"
         "agfi   %%r1, 256                \n\t"
         "brctg  %%r0, 0b                 \n\t"
 
         "veslg  %%v16,%%v0,32            \n\t"
-        "vfchsb %%v17,%%v0,%%v16         \n\t"
-        "vsel   %%v0,%%v16,%%v0,%%v17    \n\t"
+        "vfminsb %%v0,%%v0,%%v16,8       \n\t"
 
-        "vrepf  %%v16,%%v0,2             \n\t"
-        "wfchsb %%v17,%%v0,%%v16         \n\t"
-        "vsel   %%v0,%%v16,%%v0,%%v17    \n\t"
-        "ler    %0,%%f0                      "
+        "vrepf  %%v16,%%v0,2             \n\t"
+        "wfminsb %%v0,%%v0,%%v16,8       \n\t"
+        "lper   %0,%%f0                      "
         :"=f"(amin)
         :"r"(n),"ZR"((const FLOAT (*)[n])x)
         :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
@@ -172,11 +133,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
     } else {
 
         minf=ABS(x[0]);
-        i += inc_x;
-        j++;
 
-        BLASLONG n1 = (n - 1) & -4;
-        while ((j - 1) < n1) {
+        BLASLONG n1 = n & -4;
+        while (j < n1) {
 
             if (ABS(x[i]) < minf) {
                 minf = ABS(x[i]);
diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c
index bcdb473af..8a2b86dc1 100644
--- a/kernel/zarch/smax.c
+++ b/kernel/zarch/smax.c
@@ -33,7 +33,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
     __asm__ volatile (
         "vl     %%v0,0(%2)               \n\t"
-        "srlg   %%r0,%1,6                \n\t"
+        "srlg   %%r0,%1,6               \n\t"
         "xgr    %%r1,%%r1                \n\t"
         "0:                              \n\t"
         "pfd    1, 1024(%%r1,%2)         \n\t"
@@ -46,66 +46,44 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
         "vl     %%v21,80(%%r1,%2)        \n\t"
         "vl     %%v22,96(%%r1,%2)        \n\t"
         "vl     %%v23,112(%%r1,%2)       \n\t"
+        "vl     %%v24,128(%%r1,%2)       \n\t"
+        "vl     %%v25,144(%%r1,%2)       \n\t"
+        "vl     %%v26,160(%%r1,%2)       \n\t"
+        "vl     %%v27,176(%%r1,%2)       \n\t"
+        "vl     %%v28,192(%%r1,%2)       \n\t"
+        "vl     %%v29,208(%%r1,%2)       \n\t"
+        "vl     %%v30,224(%%r1,%2)       \n\t"
+        "vl     %%v31,240(%%r1,%2)       \n\t"
 
-        "vfchsb %%v24,%%v16,%%v17        \n\t"
-        "vfchsb %%v25,%%v18,%%v19        \n\t"
-        "vfchsb %%v26,%%v20,%%v21        \n\t"
-        "vfchsb %%v27,%%v22,%%v23        \n\t"
-        "vsel   %%v24,%%v16,%%v17,%%v24  \n\t"
-        "vsel   %%v25,%%v18,%%v19,%%v25  \n\t"
-        "vsel   %%v26,%%v20,%%v21,%%v26  \n\t"
-        "vsel   %%v27,%%v22,%%v23,%%v27  \n\t"
-
-        "vfchsb %%v28,%%v24,%%v25        \n\t"
-        "vfchsb %%v29,%%v26,%%v27        \n\t"
-        "vsel   %%v28,%%v24,%%v25,%%v28  \n\t"
-        "vsel   %%v29,%%v26,%%v27,%%v29  \n\t"
-
-        "vfchsb %%v30,%%v28,%%v29        \n\t"
-        "vsel   %%v30,%%v28,%%v29,%%v30  \n\t"
-
-        "vfchsb %%v31,%%v30,%%v0         \n\t"
-        "vsel   %%v0,%%v30,%%v0,%%v31    \n\t"
-
-        "vl     %%v16,128(%%r1,%2)       \n\t"
-        "vl     %%v17,144(%%r1,%2)       \n\t"
-        "vl     %%v18,160(%%r1,%2)       \n\t"
-        "vl     %%v19,176(%%r1,%2)       \n\t"
-        "vl     %%v20,192(%%r1,%2)       \n\t"
-        "vl     %%v21,208(%%r1,%2)       \n\t"
-        "vl     %%v22,224(%%r1,%2)       \n\t"
-        "vl     %%v23,240(%%r1,%2)       \n\t"
-
-        "vfchsb %%v24,%%v16,%%v17        \n\t"
-        "vfchsb %%v25,%%v18,%%v19        \n\t"
-        "vfchsb %%v26,%%v20,%%v21        \n\t"
-        "vfchsb %%v27,%%v22,%%v23        \n\t"
-        "vsel   %%v24,%%v16,%%v17,%%v24  \n\t"
-        "vsel   %%v25,%%v18,%%v19,%%v25  \n\t"
-        "vsel   %%v26,%%v20,%%v21,%%v26  \n\t"
-        "vsel   %%v27,%%v22,%%v23,%%v27  \n\t"
-
-        "vfchsb %%v28,%%v24,%%v25        \n\t"
-        "vfchsb %%v29,%%v26,%%v27        \n\t"
-        "vsel   %%v28,%%v24,%%v25,%%v28  \n\t"
-        "vsel   %%v29,%%v26,%%v27,%%v29  \n\t"
-
-        "vfchsb %%v30,%%v28,%%v29        \n\t"
-        "vsel   %%v30,%%v28,%%v29,%%v30  \n\t"
-
-        "vfchsb %%v31,%%v30,%%v0         \n\t"
-        "vsel   %%v0,%%v30,%%v0,%%v31    \n\t"
+        "vfmaxsb %%v16,%%v16,%%v24,0     \n\t"
+        "vfmaxsb %%v17,%%v17,%%v25,0     \n\t"
\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxsb %%v19,%%v19,%%v27,0 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxsb %%v21,%%v21,%%v29,0 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" + "vfmaxsb %%v23,%%v23,%%v31,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v21,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" + "vfmaxsb %%v19,%%v19,%%v23,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v19,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v17,0 \n\t" + + "vfmaxsb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepf %%v16,%%v0,2 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(max) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -148,11 +126,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index 91c31d284..b87ec0fe8 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -33,7 +33,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -46,66 +46,44 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v16,%%v16,%%v24,0 \n\t" + "vfminsb %%v17,%%v17,%%v25,0 \n\t" + "vfminsb %%v18,%%v18,%%v26,0 \n\t" + "vfminsb %%v19,%%v19,%%v27,0 \n\t" + "vfminsb %%v20,%%v20,%%v28,0 \n\t" + "vfminsb %%v21,%%v21,%%v29,0 
\n\t" + "vfminsb %%v22,%%v22,%%v30,0 \n\t" + "vfminsb %%v23,%%v23,%%v31,0 \n\t" + + "vfminsb %%v16,%%v16,%%v20,0 \n\t" + "vfminsb %%v17,%%v17,%%v21,0 \n\t" + "vfminsb %%v18,%%v18,%%v22,0 \n\t" + "vfminsb %%v19,%%v19,%%v23,0 \n\t" + + "vfminsb %%v16,%%v16,%%v18,0 \n\t" + "vfminsb %%v17,%%v17,%%v19,0 \n\t" + + "vfminsb %%v16,%%v16,%%v17,0 \n\t" + + "vfminsb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepf %%v16,%%v0,2 \n\t" + "wfminsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(min) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -148,11 +126,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 8ef3f42ca..8175874c0 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -69,76 +69,66 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) "vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vleg %%v24,128(%%r1,%2),0 \n\t" + "vleg %%v25,136(%%r1,%2),0 \n\t" + "vleg %%v24,144(%%r1,%2),1 \n\t" + "vleg %%v25,152(%%r1,%2),1 \n\t" + "vleg %%v26,160(%%r1,%2),0 \n\t" + "vleg %%v27,168(%%r1,%2),0 \n\t" + "vleg %%v26,176(%%r1,%2),1 \n\t" + "vleg %%v27,184(%%r1,%2),1 \n\t" + "vleg %%v28,192(%%r1,%2),0 \n\t" + "vleg %%v29,200(%%r1,%2),0 \n\t" + "vleg %%v28,208(%%r1,%2),1 \n\t" + "vleg %%v29,216(%%r1,%2),1 \n\t" + "vleg %%v30,224(%%r1,%2),0 \n\t" + "vleg %%v31,232(%%r1,%2),0 \n\t" + "vleg %%v30,240(%%r1,%2),1 \n\t" + "vleg %%v31,248(%%r1,%2),1 \n\t" + + "vflpdb %%v16,%%v16 \n\t" + "vflpdb %%v17,%%v17 \n\t" + "vflpdb %%v18,%%v18 \n\t" + "vflpdb %%v19,%%v19 \n\t" + "vflpdb %%v20,%%v20 \n\t" + "vflpdb %%v21,%%v21 \n\t" + "vflpdb %%v22,%%v22 \n\t" + "vflpdb 
%%v23,%%v23 \n\t" + "vflpdb %%v24,%%v24 \n\t" + "vflpdb %%v25,%%v25 \n\t" + "vflpdb %%v26,%%v26 \n\t" + "vflpdb %%v27,%%v27 \n\t" + "vflpdb %%v28,%%v28 \n\t" + "vflpdb %%v29,%%v29 \n\t" + "vflpdb %%v30,%%v30 \n\t" + "vflpdb %%v31,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" + "vfadb %%v18,%%v18,%%v19 \n\t" + "vfadb %%v20,%%v20,%%v21 \n\t" + "vfadb %%v22,%%v22,%%v23 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -185,11 +175,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { maxf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 30fd1d030..5d57ff12e 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -69,76 +69,66 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) "vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vleg %%v24,128(%%r1,%2),0 \n\t" + "vleg %%v25,136(%%r1,%2),0 \n\t" + "vleg 
%%v24,144(%%r1,%2),1 \n\t" + "vleg %%v25,152(%%r1,%2),1 \n\t" + "vleg %%v26,160(%%r1,%2),0 \n\t" + "vleg %%v27,168(%%r1,%2),0 \n\t" + "vleg %%v26,176(%%r1,%2),1 \n\t" + "vleg %%v27,184(%%r1,%2),1 \n\t" + "vleg %%v28,192(%%r1,%2),0 \n\t" + "vleg %%v29,200(%%r1,%2),0 \n\t" + "vleg %%v28,208(%%r1,%2),1 \n\t" + "vleg %%v29,216(%%r1,%2),1 \n\t" + "vleg %%v30,224(%%r1,%2),0 \n\t" + "vleg %%v31,232(%%r1,%2),0 \n\t" + "vleg %%v30,240(%%r1,%2),1 \n\t" + "vleg %%v31,248(%%r1,%2),1 \n\t" + + "vflpdb %%v16,%%v16 \n\t" + "vflpdb %%v17,%%v17 \n\t" + "vflpdb %%v18,%%v18 \n\t" + "vflpdb %%v19,%%v19 \n\t" + "vflpdb %%v20,%%v20 \n\t" + "vflpdb %%v21,%%v21 \n\t" + "vflpdb %%v22,%%v22 \n\t" + "vflpdb %%v23,%%v23 \n\t" + "vflpdb %%v24,%%v24 \n\t" + "vflpdb %%v25,%%v25 \n\t" + "vflpdb %%v26,%%v26 \n\t" + "vflpdb %%v27,%%v27 \n\t" + "vflpdb %%v28,%%v28 \n\t" + "vflpdb %%v29,%%v29 \n\t" + "vflpdb %%v30,%%v30 \n\t" + "vflpdb %%v31,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" + "vfadb %%v18,%%v18,%%v19 \n\t" + "vfadb %%v20,%%v20,%%v21 \n\t" + "vfadb %%v22,%%v22,%%v23 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmindb %%v16,%%v16,%%v24,0 \n\t" + "vfmindb %%v18,%%v18,%%v26,0 \n\t" + "vfmindb %%v20,%%v20,%%v28,0 \n\t" + "vfmindb %%v22,%%v22,%%v30,0 \n\t" + + "vfmindb %%v16,%%v16,%%v20,0 \n\t" + "vfmindb %%v18,%%v18,%%v22,0 \n\t" - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmindb %%v16,%%v16,%%v18,0 \n\t" - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmindb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmindb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -185,11 +175,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); From 63bbd7b0d79d41da2a7cc81139a62b81fa247640 Mon Sep 17 00:00:00 2001 From: Daniel Cohen Gindi Date: Mon, 21 Jan 2019 08:35:23 +0200 Subject: [PATCH 405/935] Better support for MSVC/Windows in CMake --- CMakeLists.txt | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812e6bf6f..8f3abe4b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,19 @@ endif() ####### +if(MSVC AND MSVC_STATIC_CRT) + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() +endif() message(WARNING "CMake support is experimental. 
It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -149,12 +162,6 @@ if (${DYNAMIC_ARCH}) endforeach() endif () -# Only build shared libs for MSVC -if (MSVC) - set(BUILD_SHARED_LIBS ON) -endif() - - # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) @@ -314,7 +321,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NOFORTRAN) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) + set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") @@ -327,10 +334,11 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) + file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) From f0d834b824fd5723c5cd8df01ed1aaa7a78548c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 12:32:24 +0100 Subject: [PATCH 406/935] Use VERSION_LESS for comparisons involving software version numbers --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f3abe4b8..afd9d2cf2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,7 +147,7 @@ endif () # Only generate .def for dll on MSVC and always produce pdb files for debug and release if(MSVC) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") endif() set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") @@ -173,7 +173,7 @@ endif() # Handle MSVC exports if(MSVC AND BUILD_SHARED_LIBS) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) From 24288803b3cde043bc4c10d82080509989680efb Mon Sep 17 00:00:00 2001 From: Daniel Cohen Gindi Date: Tue, 22 Jan 2019 14:38:01 +0200 Subject: [PATCH 407/935] Adjust test script for correct deployment --- appveyor.yml | 2 +- utest/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 141d3a130..95f6cf7c5 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT .. 
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 1b426afe7..dc306501f 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -61,7 +61,7 @@ foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${CMAKE_CURRENT_BINARY_DIR}) endforeach() -if (MSVC) +if (MSVC AND BUILD_SHARED_LIBS) add_custom_command(TARGET ${OpenBLAS_utest_bin} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. From 21eda8b5774aa92aecb9babba0b3eda0a992ddb9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 18:47:12 +0100 Subject: [PATCH 408/935] Report SkylakeX as Haswell if compiler does not support AVX512 ... or make was invoked with NO_AVX512=1 --- getarch.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/getarch.c b/getarch.c index 78ba0fefd..d03ce6e98 100644 --- a/getarch.c +++ b/getarch.c @@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#else +#define NO_AVX512 +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX +#ifdef NO_AVX512 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#else #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif +#endif #ifdef FORCE_ATOM #define FORCE From b56b34a75cf3ae253cf8904416c6716406aad1fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 18:55:43 +0100 Subject: [PATCH 409/935] Syntax fix --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 95f6cf7c5..741c66291 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. 
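Note: the getarch.c hunk in PATCH 408/935 above decides the SkylakeX-to-Haswell fallback entirely
at compile time, from the version macros of the compiler that builds getarch. The following is a
minimal standalone sketch of the same gate, for reference only; COMPILER_HAS_AVX512 and the printed
strings are illustrative names introduced here, not symbols used by OpenBLAS, and the sketch does
not cover the separate NO_AVX512=1 make option mentioned in the commit message:

    #include <stdio.h>

    /* Mirror of the gate added to getarch.c: assume AVX512 support only for
       gcc newer than 6 compiling with AVX2 enabled, or clang 6 and newer. */
    #if ((defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || \
         (defined(__clang__) && __clang_major__ >= 6))
    #define COMPILER_HAS_AVX512 1
    #else
    #define COMPILER_HAS_AVX512 0
    #endif

    int main(void) {
        /* Under this gate, a FORCE_SKYLAKEX build silently falls back to the
           HASWELL kernels when the compiler cannot emit AVX512 instructions. */
        puts(COMPILER_HAS_AVX512 ? "CORENAME=SKYLAKEX" : "CORENAME=HASWELL");
        return 0;
    }

Compiled with an older gcc this prints CORENAME=HASWELL; with gcc 8 and -mavx2, or a recent clang,
it prints CORENAME=SKYLAKEX, matching the LIBNAME/CORENAME switch in the patch.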
From 8533aca96470d361cc5cc81da329190811951df1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Jan 2019 10:03:00 +0100 Subject: [PATCH 410/935] Avoid penalizing tall skinny matrices --- interface/trsm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/interface/trsm.c b/interface/trsm.c index faec03ac2..f2da285de 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -82,9 +82,9 @@ #endif #ifndef COMPLEX -#define SMP_FACTOR 8 +#define SMP_FACTOR 256 #else -#define SMP_FACTOR 4 +#define SMP_FACTOR 128 #endif static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { @@ -372,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - if ( args.m < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) +/* + if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else - if ( args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) + if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; +*/ + if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD) + args.nthreads = 1; else args.nthreads = num_cpu_avail(3); From e908ac2a5145ac1a0d43e6baf39df14ade061d57 Mon Sep 17 00:00:00 2001 From: Edison Gustavo Muenz Date: Wed, 23 Jan 2019 15:09:13 +0100 Subject: [PATCH 411/935] Fix include directory of exported targets --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812e6bf6f..d3a9a2797 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,7 +157,7 @@ endif() # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) -target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) +target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) # Android needs to explicitly link against libm if(ANDROID) From e882b239aa75090c7871d5848a0ead7d37bafb6f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 00:45:45 +0100 Subject: [PATCH 412/935] Correct naming of getrf_parallel object fixes #1984 --- lapack/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index c0a7543ca..d48a270ab 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -63,7 +63,6 @@ if (USE_THREAD) # these do not have 'z' versions set(PARALLEL_SOURCES - ${GETRF_SRC} lauum/lauum_U_parallel.c lauum/lauum_L_parallel.c potrf/potrf_U_parallel.c @@ -81,6 +80,10 @@ if (USE_THREAD) trtri/trtri_L_parallel.c ) + foreach (float_type ${FLOAT_TYPES}) + GenerateNamedObjects("${GETRF_SRC}" "" "getrf_parallel" false "" "" false ${float_type}) + endforeach() + GenerateNamedObjects("${PARALLEL_SOURCES}") endif () From 36b844af889374934a4c5af19cf371cf29731d2e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 17:47:22 +0100 Subject: [PATCH 413/935] Change ARMV8 target to ARMV7 when BINARY32 is set fixes #1961 --- Makefile.system | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.system b/Makefile.system index 20d4f6492..67c8cd197 100644 --- a/Makefile.system +++ b/Makefile.system @@ -95,6 +95,9 @@ endif ifeq ($(TARGET), ZEN) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), ARMV8) +GETARCH_FLAGS := -DFORCE_ARMV7 +endif endif From 58dd7e4501ad55ca03ae1da783de72cc36345f61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 17:52:33 +0100 Subject: [PATCH 414/935] Change ARMV8 target to ARMV7 for 
BINARY=32 --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index a060d98cb..4cee7bd18 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () + if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") + set(TARGET "ARMV7") + endif () endif () if (DEFINED TARGET) From 0f24b39ebf8945ddbe5d1516123e98b62853f5b4 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sun, 27 Jan 2019 15:33:00 +0100 Subject: [PATCH 415/935] Reword/expand comments in Makefile.rule Lots of small changes in the wording of the comments, plus an expansion of the NUM_THREADS and NO_AFFINITY sections. --- Makefile.rule | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 7c128fb49..1d5dcacaa 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -48,6 +48,8 @@ VERSION = 0.3.6.dev # HOSTCC = gcc # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 +# Please note that AVX is not available on 32-bit. +# Setting BINARY=32 disables AVX/AVX2/AVX-512. # BINARY=64 # About threaded BLAS. It will be automatically detected if you don't @@ -57,7 +59,7 @@ VERSION = 0.3.6.dev # USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. -# This flag is always set for POWER8. Don't modify the flag +# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. # USE_OPENMP = 1 # The OpenMP scheduler to use - by default this is "static" and you @@ -68,36 +70,39 @@ VERSION = 0.3.6.dev # allow you to select the scheduler from the environment variable OMP_SCHEDULE # CCOMMON_OPT += -DOMP_SCHED=dynamic -# You can define maximum number of threads. Basically it should be -# less than actual number of cores. If you don't specify one, it's +# You can define the maximum number of threads. Basically it should be less +# than or equal to the number of CPU threads. If you don't specify one, it's # automatically detected by the the script. +# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to +# restrict NUM_THREADS to the number of physical cores. By default, the automatic +# detection includes logical CPUs, thus allowing the use of SMT. # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call -# OpenBLAS's calculation API from multi threads, please comment it in. -# This flag defines how many instances of OpenBLAS's calculation API can -# actually run in parallel. If more threads call OpenBLAS's calculation API, +# OpenBLAS's calculation API from multiple threads, please comment this in. +# This flag defines how many instances of OpenBLAS's calculation API can actually +# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API, # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 -# if you don't need to install the static library, please comment it in. +# If you don't need to generate the static library, please comment this in. # NO_STATIC = 1 -# if you don't need generate the shared library, please comment it in. +# If you don't need to generate the shared library, please comment this in. 
# NO_SHARED = 1 -# If you don't need CBLAS interface, please comment it in. +# If you don't need the CBLAS interface, please comment this in. # NO_CBLAS = 1 -# If you only want CBLAS interface without installing Fortran compiler, -# please comment it in. +# If you only want the CBLAS interface without installing a Fortran compiler, +# please comment this in. # ONLY_CBLAS = 1 -# If you don't need LAPACK, please comment it in. -# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. +# If you don't need LAPACK, please comment this in. +# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1. # NO_LAPACK = 1 -# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. +# If you don't need LAPACKE (C Interface to LAPACK), please comment this in. # NO_LAPACKE = 1 # Build LAPACK Deprecated functions since LAPACK 3.6.0 @@ -106,7 +111,7 @@ BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 -# If you want to use legacy threaded Level 3 implementation. +# If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 # If you want to use the new, still somewhat experimental code that uses @@ -116,8 +121,8 @@ BUILD_LAPACK_DEPRECATED = 1 # USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran -# compiler supports this. It's safe to keep comment it out if you -# are not sure(equivalent to "-i8" option). +# compilers support this. It's safe to keep this commented out if you +# are not sure. (This is equivalent to the "-i8" ifort option). # INTERFACE64 = 1 # Unfortunately most of kernel won't give us high quality buffer. @@ -125,10 +130,15 @@ BUILD_LAPACK_DEPRECATED = 1 # but it will consume time. If you don't like it, you can disable one. NO_WARMUP = 1 -# If you want to disable CPU/Memory affinity on Linux. +# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling. +# This feature is only implemented on Linux, and is always disabled on other platforms. +# Enabling affinity handling may improve performance, especially on NUMA systems, but +# it may conflict with certain applications that also try to manage affinity. +# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing +# else modifies affinity settings. NO_AFFINITY = 1 -# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus +# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers From ea1716ce2aaa4edf09e837796026ecd6cae9116b Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sun, 27 Jan 2019 17:22:26 +0100 Subject: [PATCH 416/935] Update Makefile.rule Revert generate to install, explain the nature of the affinity conflict --- Makefile.rule | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 1d5dcacaa..faf34c0a1 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -85,7 +85,7 @@ VERSION = 0.3.6.dev # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 -# If you don't need to generate the static library, please comment this in. +# If you don't need to install the static library, please comment this in. # NO_STATIC = 1 # If you don't need to generate the shared library, please comment this in. 
@@ -134,6 +134,8 @@ NO_WARMUP = 1 # This feature is only implemented on Linux, and is always disabled on other platforms. # Enabling affinity handling may improve performance, especially on NUMA systems, but # it may conflict with certain applications that also try to manage affinity. +# This conflict can result in threads of the application calling OpenBLAS ending up locked +# to the same core(s) as OpenBLAS, possibly binding all threads to a single core. # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing # else modifies affinity settings. NO_AFFINITY = 1 From c8ef9fb22064dc6cb1c7515ad8d7e25c7adf9a8a Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:16:18 +0200 Subject: [PATCH 417/935] [ZARCH] Fix bug in iamax/iamin/imax/imin --- kernel/zarch/icamax.c | 1 + kernel/zarch/icamin.c | 1 + kernel/zarch/idamax.c | 1 + kernel/zarch/idamin.c | 1 + kernel/zarch/idmax.c | 1 + kernel/zarch/idmin.c | 1 + kernel/zarch/isamax.c | 1 + kernel/zarch/isamin.c | 1 + kernel/zarch/ismax.c | 1 + kernel/zarch/ismin.c | 1 + kernel/zarch/izamax.c | 1 + kernel/zarch/izamin.c | 1 + 12 files changed, 12 insertions(+) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 27f969eee..96cb37a1d 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -283,6 +283,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index ae7b37b4f..73bd9e8de 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -283,6 +283,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index e5a1d3a7c..4a0114242 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -206,6 +206,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index a68f7282f..503f92ff7 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -206,6 +206,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 4c3040779..871c896e6 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -182,6 +182,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ba1776a49..dd14ec92c 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -182,6 +182,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 2f5c1c867..1a9ac3cd8 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -249,6 +249,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 04e05aad9..5a7e669eb 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -249,6 +249,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 084b4ce94..0b144c200 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -225,6 +225,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; 
maxf = x[0]; i++; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 4e85816a3..7fda9dffc 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -225,6 +225,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2ffad2570..7db64181c 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -204,6 +204,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 1e037c0c7..707d702d3 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -204,6 +204,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); ix += 2; i++; From 04873bb174d45a9cac478d7db7fd6f2618df2e81 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:32:24 +0200 Subject: [PATCH 418/935] [ZARCH] Undo the last commit --- kernel/zarch/icamax.c | 1 - kernel/zarch/icamin.c | 1 - kernel/zarch/idamax.c | 1 - kernel/zarch/idamin.c | 1 - kernel/zarch/idmax.c | 1 - kernel/zarch/idmin.c | 1 - kernel/zarch/isamax.c | 1 - kernel/zarch/isamin.c | 1 - kernel/zarch/ismax.c | 1 - kernel/zarch/ismin.c | 1 - kernel/zarch/izamax.c | 1 - kernel/zarch/izamin.c | 1 - 12 files changed, 12 deletions(-) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 96cb37a1d..27f969eee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -283,7 +283,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 73bd9e8de..ae7b37b4f 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -283,7 +283,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - min = 0; minf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 4a0114242..e5a1d3a7c 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -206,7 +206,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 503f92ff7..a68f7282f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -206,7 +206,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 871c896e6..4c3040779 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -182,7 +182,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index dd14ec92c..ba1776a49 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -182,7 +182,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 1a9ac3cd8..2f5c1c867 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -249,7 +249,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 5a7e669eb..04e05aad9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -249,7 +249,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = ABS(x[0]); i++; } diff --git 
a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 0b144c200..084b4ce94 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -225,7 +225,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 7fda9dffc..4e85816a3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -225,7 +225,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 7db64181c..2ffad2570 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -204,7 +204,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 707d702d3..1e037c0c7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -204,7 +204,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - min = 0; minf = CABS1(x,0); ix += 2; i++; From c7143c1019d7a35f94454e2ac811cd948a41d22e Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:52:23 +0200 Subject: [PATCH 419/935] [ZARCH] Fix iamax/imax single precision --- kernel/zarch/icamax.c | 2 ++ kernel/zarch/icamin.c | 2 ++ kernel/zarch/isamax.c | 2 ++ kernel/zarch/isamin.c | 2 ++ kernel/zarch/ismax.c | 2 ++ kernel/zarch/ismin.c | 2 ++ 6 files changed, 12 insertions(+) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 27f969eee..2d1442ad9 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -248,6 +248,8 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index ae7b37b4f..79aa6d341 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -248,6 +248,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 2f5c1c867..6e0aaa162 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -216,6 +216,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 04e05aad9..266c48f7f 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -216,6 +216,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 084b4ce94..c968ce6fa 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -192,6 +192,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 
\n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 4e85816a3..0145b31b3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -192,6 +192,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" From dc4d3bccd5ee7de7bb823aa0bb7008a04bcc21d4 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 29 Jan 2019 03:47:49 +0200 Subject: [PATCH 420/935] [ZARCH] Fix icamax/icamin --- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 2d1442ad9..113c0cef5 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -94,7 +94,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vlef %%v18,48(%%r1,%3),2 \n\t" "vlef %%v19,52(%%r1,%3),2 \n\t" "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,30(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" "vlef %%v20,64(%%r1,%3),0 \n\t" "vlef %%v21,68(%%r1,%3),0 \n\t" diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 79aa6d341..5096b641b 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -94,7 +94,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vlef %%v18,48(%%r1,%3),2 \n\t" "vlef %%v19,52(%%r1,%3),2 \n\t" "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,30(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" "vlef %%v20,64(%%r1,%3),0 \n\t" "vlef %%v21,68(%%r1,%3),0 \n\t" From fcd814a8d292b7712a4230d9b9a20f0f2ce0fe52 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 29 Jan 2019 17:59:38 +0200 Subject: [PATCH 421/935] [ZARCH] Fix bug in max/min functions --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- kernel/zarch/izamax.c | 2 +- kernel/zarch/izamin.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 66d250896..f6fa772ac 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -198,7 +198,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { maxf = camax_kernel_32(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 5abc685b2..4bd6ca17d 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -198,7 +198,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { minf = camin_kernel_32(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 113c0cef5..a9e7f91fc 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -280,7 +280,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { max = icamax_kernel_32(n1, x, &maxf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 5096b641b..faf5f9c65 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -280,7 +280,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { min = icamin_kernel_32(n1, x, &minf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2ffad2570..2d1cc2365 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -199,7 +199,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if 
(n1 > 0) { max = izamax_kernel_16(n1, x, &maxf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 1e037c0c7..676fd7c6d 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -199,7 +199,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { min = izamin_kernel_16(n1, x, &minf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 8175874c0..b7214783f 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { maxf = zamax_kernel_16(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 5d57ff12e..d53fdb6b8 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { minf = zamin_kernel_16(n1, x); - + ix = n1 * 2; i = n1; } else From eaf20f0e7ac8c2ab53deeb78f959bebb2a49cddd Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 09:26:50 +0200 Subject: [PATCH 422/935] Remove ztest --- ztest/Makefile | 437 ---------------------------------- ztest/amax.c | 235 ------------------ ztest/amin.c | 235 ------------------ ztest/asum.c | 263 -------------------- ztest/axpy.c | 303 ----------------------- ztest/copy.c | 291 ----------------------- ztest/dot.c | 296 ----------------------- ztest/dsdot.c | 229 ------------------ ztest/gemv.c | 633 ------------------------------------------------- ztest/iamax.c | 284 ---------------------- ztest/iamin.c | 284 ---------------------- ztest/imax.c | 231 ------------------ ztest/imin.c | 231 ------------------ ztest/max.c | 229 ------------------ ztest/min.c | 229 ------------------ ztest/rot.c | 303 ----------------------- ztest/scal.c | 308 ------------------------ ztest/swap.c | 306 ------------------------ 18 files changed, 5327 deletions(-) delete mode 100644 ztest/Makefile delete mode 100644 ztest/amax.c delete mode 100644 ztest/amin.c delete mode 100644 ztest/asum.c delete mode 100644 ztest/axpy.c delete mode 100644 ztest/copy.c delete mode 100644 ztest/dot.c delete mode 100644 ztest/dsdot.c delete mode 100644 ztest/gemv.c delete mode 100644 ztest/iamax.c delete mode 100644 ztest/iamin.c delete mode 100644 ztest/imax.c delete mode 100644 ztest/imin.c delete mode 100644 ztest/max.c delete mode 100644 ztest/min.c delete mode 100644 ztest/rot.c delete mode 100644 ztest/scal.c delete mode 100644 ztest/swap.c diff --git a/ztest/Makefile b/ztest/Makefile deleted file mode 100644 index 0ff7fe46a..000000000 --- a/ztest/Makefile +++ /dev/null @@ -1,437 +0,0 @@ -TOPDIR = .. 
-include $(TOPDIR)/Makefile.system - -goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto - -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dsdot #################################################### -dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMAX ############################################## -icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMAX ############################################## -samax.goto : samax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMAX ############################################## -damax.goto : damax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMAX ############################################## -ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMAX ############################################## -idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMAX 
############################################## -smax.goto : smax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMAX ############################################## -dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMIN ############################################## -isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMIN ############################################## -idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMIN ############################################## -icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMIN ############################################## -izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMIN ############################################## -samin.goto : samin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMIN ############################################## -damin.goto : damin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMIN ############################################## -ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMIN ############################################## -idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMIN ############################################## -smin.goto : smin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMIN ############################################## -dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cgemv #################################################### - 
-cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Crot #################################################### -crot.goto : crot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zrot #################################################### -zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Scopy #################################################### -scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -################################################################################################### - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -dsdot.$(SUFFIX) : dsdot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -isamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -samax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ismax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -smax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -isamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -samin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -camin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zamin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ismin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -smin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -crot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - 
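Each compile rule in this block builds one precision/domain variant of a single generic source, so dot.c alone yields the sdot, ddot, cdot and zdot objects. A sketch of how the sources consume the -UCOMPLEX/-DCOMPLEX and -UDOUBLE/-DDOUBLE flags (the typedefs here are illustrative; in the real tree FLOAT and COMPSIZE come from common.h):

#include <math.h>

#ifdef DOUBLE
typedef double FLOAT;   /* -DDOUBLE selects the d/z variants */
#define ABS fabs
#else
typedef float  FLOAT;   /* -UDOUBLE leaves the s/c variants  */
#define ABS fabsf
#endif

#ifdef COMPLEX
#define COMPSIZE 2      /* two FLOATs per element, as in the drivers */
#else
#define COMPSIZE 1
#endif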
-dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -clean :: - @rm -f *.goto - diff --git a/ztest/amax.c b/ztest/amax.c deleted file mode 100644 index f2e3f5411..000000000 --- a/ztest/amax.c +++ /dev/null @@ -1,235 +0,0 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - - if (n <= 0 || inc_x <= 0) return(maxf); - - maxf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) > maxf ) - { - maxf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(maxf); -} - -#undef AMAX -#ifdef DOUBLE -#define AMAX BLASFUNC(damax) -#else -#define AMAX BLASFUNC(samax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* 
avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - - if (n <= 0 || inc_x <= 0) return(minf); - - minf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) < minf ) - { - minf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(minf); -} - -#undef AMIN -#ifdef DOUBLE -#define AMIN BLASFUNC(damin) -#else -#define AMIN BLASFUNC(samin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - 
BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); - - inc_x2 = 2 * inc_x; - - n *= inc_x2; - while(i < n) - { - sumf += CABS1(x,i); - i += inc_x2; - } - return(sumf); -} -#else -FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - if (n <= 0 || inc_x <= 0) return(sumf); - - n *= inc_x; - while(i < n) - { - sumf += ABS(x[i]); - i += inc_x; - } - return(sumf); -} -#endif - -#undef ASUM -#ifdef COMPLEX -#ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) -#else -#define ASUM BLASFUNC(scasum) -#endif -#else -#ifdef DOUBLE -#define ASUM BLASFUNC(dasum) -#else -#define ASUM BLASFUNC(sasum) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, 
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix,iy; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - if ( da_r == 0.0 && da_i == 0.0 ) return(0); - - ix = 0; - iy = 0; - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { -#if !defined(CONJ) - y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; - y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; -#else - y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; - y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; -#endif - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); - -} -#else -int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix,iy; - - if ( n < 0 ) return(0); - if ( da == 0.0 ) return(0); - - ix = 0; - iy = 0; - - while(i < n) - { - - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); - -} -#endif - -#undef AXPY -#ifdef COMPLEX -#ifdef DOUBLE -#define AXPY BLASFUNC(zaxpy) -#else -#define AXPY BLASFUNC(caxpy) -#endif -#else -#ifdef DOUBLE -#define AXPY BLASFUNC(daxpy) -#else -#define AXPY BLASFUNC(saxpy) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *y_c;; - FLOAT alpha[2] = { 2.0, 2.0 }; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - argc--;argv++; - - blasint iy; - int test = 1; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of 
Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { - - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2; - iy += inc_y2; - i++ ; - - } - return(0); - -} -#else -int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - if ( n < 0 ) return(0); - - while(i < n) - { - - y[iy] = x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); - -} -#endif - -#undef COPY -#ifdef COMPLEX -#ifdef DOUBLE -#define COPY BLASFUNC(zcopy) -#else -#define COPY BLASFUNC(ccopy) -#endif -#else -#ifdef DOUBLE -#define COPY BLASFUNC(dcopy) -#else -#define COPY BLASFUNC(scopy) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *y_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) 
inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT dot[2]; - OPENBLAS_COMPLEX_FLOAT result; - BLASLONG inc_x2; - BLASLONG inc_y2; - - dot[0]=0.0; - dot[1]=0.0; - - CREAL(result) = 0.0 ; - CIMAG(result) = 0.0 ; - - if ( n < 1 ) return(result); - - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - - while(i < n) - { -#if !defined(CONJ) - dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; - dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; -#else - dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; - dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; -#endif - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - CREAL(result) = dot[0]; - CIMAG(result) = dot[1]; - return(result); - -} -#else -FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT dot = 0.0 ; - - if ( n < 0 ) return(dot); - - while(i < n) - { - - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); -} -#endif - -#undef DOT -#ifdef COMPLEX -#ifdef DOUBLE -#define DOT BLASFUNC(zdotu) -#else -#define DOT BLASFUNC(cdotu) -#endif -#else -#ifdef DOUBLE -#define DOT BLASFUNC(ddot) -#else -#define DOT BLASFUNC(sdot) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, 
SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y; -#ifdef COMPLEX - OPENBLAS_COMPLEX_FLOAT result, result_c; -#else - FLOAT result, result_c; -#endif - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - double dot = 0.0 ; - - if ( n < 0 ) return(dot); - - while(i < n) - { - - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); -} - -#undef DSDOT -#define DSDOT BLASFUNC(dsdot) - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - 
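The main() below follows the same measurement pattern as every other ztest driver: wrap the BLAS call and its C reference in gettimeofday() pairs, accumulate wall time over OPENBLAS_LOOPS iterations, and print both timings next to the correctness check. A sketch of that pattern, assuming the usual timeval arithmetic (elapsed_sec is a hypothetical helper, not code from these files):

#include <sys/time.h>

static double elapsed_sec(struct timeval start, struct timeval stop) {
    return (double)(stop.tv_sec - start.tv_sec)
         + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;
}

/* per-size loop, repeated 'loops' times:
 *   gettimeofday(&start, NULL);
 *   result = DSDOT(&m, x, &inc_x, y, &inc_y);   // routine under test
 *   gettimeofday(&stop, NULL);
 *   timeg += elapsed_sec(start, stop);
 * with the same bracket around dsdot_c() to fill timeg_c. */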
-int main(int argc, char *argv[]){ - - FLOAT *x, *y; - double result, result_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i; - BLASLONG ix, iy; - BLASLONG j; - FLOAT *a_ptr; - FLOAT temp_r, temp_i; - BLASLONG inc_x2, inc_y2; - BLASLONG lda2; - BLASLONG i2; - - lda2 = 2 * lda; - - ix = 0; - a_ptr = a; - - if (inc_x == 1 && inc_y == 1) - { - - for (j = 0; jtv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y, *y_c; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 0.0}; - char trans='N'; - blasint m, i, j; - blasint inc_x=1,inc_y=1; - blasint n=0; - int has_param_n = 0; - int has_param_m = 0; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint y_size; - blasint iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - - int tomax = to; - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; - if ((p = getenv("OPENBLAS_PARAM_N"))) { - n = atoi(p); - if ((n>0)) has_param_n = 1; - if ( n > tomax ) tomax = n; - } - if ( has_param_n == 0 ) - if ((p = getenv("OPENBLAS_PARAM_M"))) { - m = atoi(p); - if ((m>0)) has_param_m = 1; - if ( m > tomax ) tomax = m; - } - - - - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - if (has_param_m == 0) - { - - for(m = from; m <= to; m += step) - { - timeg=0; - timeg_c=0; - if ( has_param_n == 0 ) n = m; - fprintf(stderr, " %6dx%d :", (int)m,(int)n); - for(j = 0; j < m; j++){ - for(i = 0; i < n * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf; - BLASLONG max=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return(max+1); -} -#else -BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - BLASLONG max=0; - - if (n <= 0 || inc_x <= 0) return(max); - - maxf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) > maxf ) - { - max = i; - maxf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(max+1); -} -#endif - -#undef IAMAX -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMAX BLASFUNC(izamax) -#else -#define IAMAX BLASFUNC(icamax) -#endif -#else -#ifdef DOUBLE -#define IAMAX BLASFUNC(idamax) -#else -#define IAMAX BLASFUNC(isamax) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into 
microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(min); - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return(min+1); -} -#else -BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - BLASLONG min=0; - - if (n <= 0 || inc_x <= 0) return(min); - - minf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) < minf ) - { - min = i; - minf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(min+1); -} -#endif - -#undef IAMIN -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMIN BLASFUNC(izamin) -#else -#define IAMIN BLASFUNC(icamin) -#endif -#else -#ifdef DOUBLE -#define IAMIN BLASFUNC(idamin) -#else -#define IAMIN BLASFUNC(isamin) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval 
*tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - BLASLONG max=0; - - if (n <= 0 || inc_x <= 0) return(max); - - maxf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] > maxf ) - { - max = i; - maxf = x[ix]; - } - ix += inc_x; - i++; - } - return(max+1); -} - -#undef IMAX -#ifdef DOUBLE -#define IMAX BLASFUNC(idmax) -#else -#define IMAX BLASFUNC(ismax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); 
- tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - BLASLONG min=0; - - if (n <= 0 || inc_x <= 0) return(min); - - minf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] < minf ) - { - min = i; - minf = x[ix]; - } - ix += inc_x; - i++; - } - return(min+1); -} - -#undef IMIN -#ifdef DOUBLE -#define IMIN BLASFUNC(idmin) -#else -#define IMIN BLASFUNC(ismin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | 
IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - - if (n <= 0 || inc_x <= 0) return(maxf); - - maxf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] > maxf ) - { - maxf = x[ix]; - } - ix += inc_x; - i++; - } - return(maxf); -} - -#undef MAX_ -#ifdef DOUBLE -#define MAX_ BLASFUNC(dmax) -#else -#define MAX_ BLASFUNC(smax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - 
blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - - if (n <= 0 || inc_x <= 0) return(minf); - - minf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] < minf ) - { - minf = x[ix]; - } - ix += inc_x; - i++; - } - return(minf); -} - -#undef MIN_ -#ifdef DOUBLE -#define MIN_ BLASFUNC(dmin) -#else -#define MIN_ BLASFUNC(smin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); -} -#else -int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); - - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; - - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); -} -#endif - -#undef ROT -#ifdef COMPLEX -#ifdef DOUBLE -#define ROT BLASFUNC(zdrot) -#else -#define ROT BLASFUNC(csrot) -#endif -#else -#ifdef DOUBLE -#define ROT BLASFUNC(drot) -#else -#define ROT BLASFUNC(srot) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *x_c, *y_c; - // FLOAT result; - blasint m, i; - blasint inc_x=1,inc_y=1; - FLOAT c[1] = { 2.0 }; - FLOAT s[1] = { 2.0 }; - int loops = 1; - int l; - char *p; - - int from = 1; 
- int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix,iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; - - if ( (n <= 0) || (inc_x <= 0)) - return(0); - - inc_x2 = 2 * inc_x; - for ( i=0; itv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *x_c; - FLOAT alpha[2] = { 2.0, 2.0 }; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", 
from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); -} -#else -int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n < 0 ) return(0); - - while(i < n) - { - - temp = x[ix] ; - x[ix] = y[iy] ; - y[iy] = temp ; - - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); -} -#endif - -#undef SWAP -#ifdef COMPLEX -#ifdef DOUBLE -#define SWAP BLASFUNC(zswap) -#else -#define SWAP BLASFUNC(cswap) -#endif -#else -#ifdef DOUBLE -#define SWAP BLASFUNC(dswap) -#else -#define SWAP BLASFUNC(sswap) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *x_c, *y_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - 
int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix,iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l Date: Thu, 31 Jan 2019 15:25:15 +0100 Subject: [PATCH 423/935] Fix wrong comparison that made IMIN identical to IMAX as suggested in #1990 --- kernel/arm/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index 598cba387..ffc65226e 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 86a824c97f1f4ccfe8b24678dc0fdaf4846a7055 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 15:27:21 +0100 Subject: [PATCH 424/935] Fix wrong comparison that made IMIN identical to IMAX as reported by aarnez in #1990 --- kernel/mips/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c index d9b283d2d..bf130613b 100644 --- a/kernel/mips/imin.c +++ b/kernel/mips/imin.c @@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 48b9b94f7f7d1856babac7f20f7e9d90fa8750d0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 18:52:11 +0200 Subject: [PATCH 425/935] [ZARCH] Improve loading performance for camax/icamax --- kernel/zarch/camax.c | 128 ++++++++++++++++++------------------------ kernel/zarch/camin.c | 128 ++++++++++++++++++------------------------ kernel/zarch/icamax.c | 114 ++++++++++++++++--------------------- kernel/zarch/icamin.c | 114 ++++++++++++++++--------------------- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 6 files changed, 212 insertions(+), 276 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index f6fa772ac..2e9648640 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -52,82 +52,66 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "vflpsb %%v0,%%v0 \n\t" "vflpsb %%v16,%%v16 \n\t" "vfasb %%v0,%%v0,%%v16 \n\t" + "vleib %%v1,0,0 \n\t" + "vleib %%v1,1,1 \n\t" + "vleib %%v1,2,2 \n\t" + "vleib %%v1,3,3 \n\t" + "vleib %%v1,8,4 \n\t" + "vleib %%v1,9,5 \n\t" + "vleib %%v1,10,6 \n\t" + "vleib 
%%v1,11,7 \n\t" + "vleib %%v1,16,8 \n\t" + "vleib %%v1,17,9 \n\t" + "vleib %%v1,18,10 \n\t" + "vleib %%v1,19,11 \n\t" + "vleib %%v1,24,12 \n\t" + "vleib %%v1,25,13 \n\t" + "vleib %%v1,26,14 \n\t" + "vleib %%v1,27,15 \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),1 \n\t" - "vlef %%v17,12(%%r1,%2),1 \n\t" - "vlef %%v16,16(%%r1,%2),2 \n\t" - "vlef %%v17,20(%%r1,%2),2 \n\t" - "vlef %%v16,24(%%r1,%2),3 \n\t" - "vlef %%v17,28(%%r1,%2),3 \n\t" - - "vlef %%v18,32(%%r1,%2),0 \n\t" - "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),1 \n\t" - "vlef %%v19,44(%%r1,%2),1 \n\t" - "vlef %%v18,48(%%r1,%2),2 \n\t" - "vlef %%v19,52(%%r1,%2),2 \n\t" - "vlef %%v18,56(%%r1,%2),3 \n\t" - "vlef %%v19,30(%%r1,%2),3 \n\t" - - "vlef %%v20,64(%%r1,%2),0 \n\t" - "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),1 \n\t" - "vlef %%v21,76(%%r1,%2),1 \n\t" - "vlef %%v20,80(%%r1,%2),2 \n\t" - "vlef %%v21,84(%%r1,%2),2 \n\t" - "vlef %%v20,88(%%r1,%2),3 \n\t" - "vlef %%v21,92(%%r1,%2),3 \n\t" - - "vlef %%v22,96(%%r1,%2),0 \n\t" - "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),1 \n\t" - "vlef %%v23,108(%%r1,%2),1 \n\t" - "vlef %%v22,112(%%r1,%2),2 \n\t" - "vlef %%v23,116(%%r1,%2),2 \n\t" - "vlef %%v22,120(%%r1,%2),3 \n\t" - "vlef %%v23,124(%%r1,%2),3 \n\t" - - "vlef %%v24,128(%%r1,%2),0 \n\t" - "vlef %%v25,132(%%r1,%2),0 \n\t" - "vlef %%v24,136(%%r1,%2),1 \n\t" - "vlef %%v25,140(%%r1,%2),1 \n\t" - "vlef %%v24,144(%%r1,%2),2 \n\t" - "vlef %%v25,148(%%r1,%2),2 \n\t" - "vlef %%v24,152(%%r1,%2),3 \n\t" - "vlef %%v25,156(%%r1,%2),3 \n\t" - - "vlef %%v26,160(%%r1,%2),0 \n\t" - "vlef %%v27,164(%%r1,%2),0 \n\t" - "vlef %%v26,168(%%r1,%2),1 \n\t" - "vlef %%v27,172(%%r1,%2),1 \n\t" - "vlef %%v26,176(%%r1,%2),2 \n\t" - "vlef %%v27,180(%%r1,%2),2 \n\t" - "vlef %%v26,184(%%r1,%2),3 \n\t" - "vlef %%v27,188(%%r1,%2),3 \n\t" - - "vlef %%v28,192(%%r1,%2),0 \n\t" - "vlef %%v29,196(%%r1,%2),0 \n\t" - "vlef %%v28,200(%%r1,%2),1 \n\t" - "vlef %%v29,204(%%r1,%2),1 \n\t" - "vlef %%v28,208(%%r1,%2),2 \n\t" - "vlef %%v29,212(%%r1,%2),2 \n\t" - "vlef %%v28,216(%%r1,%2),3 \n\t" - "vlef %%v29,220(%%r1,%2),3 \n\t" - - "vlef %%v30,224(%%r1,%2),0 \n\t" - "vlef %%v31,228(%%r1,%2),0 \n\t" - "vlef %%v30,232(%%r1,%2),1 \n\t" - "vlef %%v31,236(%%r1,%2),1 \n\t" - "vlef %%v30,240(%%r1,%2),2 \n\t" - "vlef %%v31,244(%%r1,%2),2 \n\t" - "vlef %%v30,248(%%r1,%2),3 \n\t" - "vlef %%v31,252(%%r1,%2),3 \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v2,16(%%r1,%2) \n\t" + "vpkg %%v17,%%v16,%%v2 \n\t" + "vperm %%v16,%%v16,%%v2,%%v1 \n\t" + + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v2,48(%%r1,%2) \n\t" + "vpkg %%v19,%%v18,%%v2 \n\t" + "vperm %%v18,%%v18,%%v2,%%v1 \n\t" + + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v2,80(%%r1,%2) \n\t" + "vpkg %%v21,%%v20,%%v2 \n\t" + "vperm %%v20,%%v20,%%v2,%%v1 \n\t" + + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v2,112(%%r1,%2) \n\t" + "vpkg %%v23,%%v22,%%v2 \n\t" + "vperm %%v22,%%v22,%%v2,%%v1 \n\t" + + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v2,144(%%r1,%2) \n\t" + "vpkg %%v25,%%v24,%%v2 \n\t" + "vperm %%v24,%%v24,%%v2,%%v1 \n\t" + + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v2,176(%%r1,%2) \n\t" + "vpkg %%v27,%%v26,%%v2 \n\t" + "vperm %%v26,%%v26,%%v2,%%v1 \n\t" + + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v2,208(%%r1,%2) \n\t" + "vpkg %%v29,%%v28,%%v2 \n\t" + "vperm %%v28,%%v28,%%v2,%%v1 \n\t" + + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v2,240(%%r1,%2) \n\t" + "vpkg %%v31,%%v30,%%v2 \n\t" + "vperm 
%%v30,%%v30,%%v2,%%v1 \n\t" "vflpsb %%v16,%%v16 \n\t" "vflpsb %%v17,%%v17 \n\t" @@ -178,7 +162,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "ler %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amax; diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 4bd6ca17d..aec59058e 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -52,82 +52,66 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vflpsb %%v0,%%v0 \n\t" "vflpsb %%v16,%%v16 \n\t" "vfasb %%v0,%%v0,%%v16 \n\t" + "vleib %%v1,0,0 \n\t" + "vleib %%v1,1,1 \n\t" + "vleib %%v1,2,2 \n\t" + "vleib %%v1,3,3 \n\t" + "vleib %%v1,8,4 \n\t" + "vleib %%v1,9,5 \n\t" + "vleib %%v1,10,6 \n\t" + "vleib %%v1,11,7 \n\t" + "vleib %%v1,16,8 \n\t" + "vleib %%v1,17,9 \n\t" + "vleib %%v1,18,10 \n\t" + "vleib %%v1,19,11 \n\t" + "vleib %%v1,24,12 \n\t" + "vleib %%v1,25,13 \n\t" + "vleib %%v1,26,14 \n\t" + "vleib %%v1,27,15 \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),1 \n\t" - "vlef %%v17,12(%%r1,%2),1 \n\t" - "vlef %%v16,16(%%r1,%2),2 \n\t" - "vlef %%v17,20(%%r1,%2),2 \n\t" - "vlef %%v16,24(%%r1,%2),3 \n\t" - "vlef %%v17,28(%%r1,%2),3 \n\t" - - "vlef %%v18,32(%%r1,%2),0 \n\t" - "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),1 \n\t" - "vlef %%v19,44(%%r1,%2),1 \n\t" - "vlef %%v18,48(%%r1,%2),2 \n\t" - "vlef %%v19,52(%%r1,%2),2 \n\t" - "vlef %%v18,56(%%r1,%2),3 \n\t" - "vlef %%v19,30(%%r1,%2),3 \n\t" - - "vlef %%v20,64(%%r1,%2),0 \n\t" - "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),1 \n\t" - "vlef %%v21,76(%%r1,%2),1 \n\t" - "vlef %%v20,80(%%r1,%2),2 \n\t" - "vlef %%v21,84(%%r1,%2),2 \n\t" - "vlef %%v20,88(%%r1,%2),3 \n\t" - "vlef %%v21,92(%%r1,%2),3 \n\t" - - "vlef %%v22,96(%%r1,%2),0 \n\t" - "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),1 \n\t" - "vlef %%v23,108(%%r1,%2),1 \n\t" - "vlef %%v22,112(%%r1,%2),2 \n\t" - "vlef %%v23,116(%%r1,%2),2 \n\t" - "vlef %%v22,120(%%r1,%2),3 \n\t" - "vlef %%v23,124(%%r1,%2),3 \n\t" - - "vlef %%v24,128(%%r1,%2),0 \n\t" - "vlef %%v25,132(%%r1,%2),0 \n\t" - "vlef %%v24,136(%%r1,%2),1 \n\t" - "vlef %%v25,140(%%r1,%2),1 \n\t" - "vlef %%v24,144(%%r1,%2),2 \n\t" - "vlef %%v25,148(%%r1,%2),2 \n\t" - "vlef %%v24,152(%%r1,%2),3 \n\t" - "vlef %%v25,156(%%r1,%2),3 \n\t" - - "vlef %%v26,160(%%r1,%2),0 \n\t" - "vlef %%v27,164(%%r1,%2),0 \n\t" - "vlef %%v26,168(%%r1,%2),1 \n\t" - "vlef %%v27,172(%%r1,%2),1 \n\t" - "vlef %%v26,176(%%r1,%2),2 \n\t" - "vlef %%v27,180(%%r1,%2),2 \n\t" - "vlef %%v26,184(%%r1,%2),3 \n\t" - "vlef %%v27,188(%%r1,%2),3 \n\t" - - "vlef %%v28,192(%%r1,%2),0 \n\t" - "vlef %%v29,196(%%r1,%2),0 \n\t" - "vlef %%v28,200(%%r1,%2),1 \n\t" - "vlef %%v29,204(%%r1,%2),1 \n\t" - "vlef %%v28,208(%%r1,%2),2 \n\t" - "vlef %%v29,212(%%r1,%2),2 \n\t" - "vlef %%v28,216(%%r1,%2),3 \n\t" - "vlef %%v29,220(%%r1,%2),3 \n\t" - - "vlef %%v30,224(%%r1,%2),0 \n\t" - "vlef %%v31,228(%%r1,%2),0 \n\t" - "vlef %%v30,232(%%r1,%2),1 \n\t" - "vlef %%v31,236(%%r1,%2),1 \n\t" - "vlef %%v30,240(%%r1,%2),2 \n\t" - "vlef %%v31,244(%%r1,%2),2 \n\t" - "vlef %%v30,248(%%r1,%2),3 \n\t" - "vlef %%v31,252(%%r1,%2),3 \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v2,16(%%r1,%2) \n\t" + "vpkg 
%%v17,%%v16,%%v2 \n\t" + "vperm %%v16,%%v16,%%v2,%%v1 \n\t" + + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v2,48(%%r1,%2) \n\t" + "vpkg %%v19,%%v18,%%v2 \n\t" + "vperm %%v18,%%v18,%%v2,%%v1 \n\t" + + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v2,80(%%r1,%2) \n\t" + "vpkg %%v21,%%v20,%%v2 \n\t" + "vperm %%v20,%%v20,%%v2,%%v1 \n\t" + + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v2,112(%%r1,%2) \n\t" + "vpkg %%v23,%%v22,%%v2 \n\t" + "vperm %%v22,%%v22,%%v2,%%v1 \n\t" + + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v2,144(%%r1,%2) \n\t" + "vpkg %%v25,%%v24,%%v2 \n\t" + "vperm %%v24,%%v24,%%v2,%%v1 \n\t" + + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v2,176(%%r1,%2) \n\t" + "vpkg %%v27,%%v26,%%v2 \n\t" + "vperm %%v26,%%v26,%%v2,%%v1 \n\t" + + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v2,208(%%r1,%2) \n\t" + "vpkg %%v29,%%v28,%%v2 \n\t" + "vperm %%v28,%%v28,%%v2,%%v1 \n\t" + + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v2,240(%%r1,%2) \n\t" + "vpkg %%v31,%%v30,%%v2 \n\t" + "vperm %%v30,%%v30,%%v2,%%v1 \n\t" "vflpsb %%v16,%%v16 \n\t" "vflpsb %%v17,%%v17 \n\t" @@ -178,7 +162,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "ler %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amin; diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index a9e7f91fc..5129ca6ee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -57,6 +57,22 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vleig %%v2,3,1 \n\t" "vrepig %%v3,16 \n\t" "vzero %%v4 \n\t" + "vleib %%v9,0,0 \n\t" + "vleib %%v9,1,1 \n\t" + "vleib %%v9,2,2 \n\t" + "vleib %%v9,3,3 \n\t" + "vleib %%v9,8,4 \n\t" + "vleib %%v9,9,5 \n\t" + "vleib %%v9,10,6 \n\t" + "vleib %%v9,11,7 \n\t" + "vleib %%v9,16,8 \n\t" + "vleib %%v9,17,9 \n\t" + "vleib %%v9,18,10 \n\t" + "vleib %%v9,19,11 \n\t" + "vleib %%v9,24,12 \n\t" + "vleib %%v9,25,13 \n\t" + "vleib %%v9,26,14 \n\t" + "vleib %%v9,27,15 \n\t" "vleif %%v24,0,0 \n\t" "vleif %%v24,1,1 \n\t" "vleif %%v24,2,2 \n\t" @@ -78,41 +94,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" - "vlef %%v16,0(%%r1,%3),0 \n\t" - "vlef %%v17,4(%%r1,%3),0 \n\t" - "vlef %%v16,8(%%r1,%3),1 \n\t" - "vlef %%v17,12(%%r1,%3),1 \n\t" - "vlef %%v16,16(%%r1,%3),2 \n\t" - "vlef %%v17,20(%%r1,%3),2 \n\t" - "vlef %%v16,24(%%r1,%3),3 \n\t" - "vlef %%v17,28(%%r1,%3),3 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v28,16(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,32(%%r1,%3),0 \n\t" - "vlef %%v19,36(%%r1,%3),0 \n\t" - "vlef %%v18,40(%%r1,%3),1 \n\t" - "vlef %%v19,44(%%r1,%3),1 \n\t" - "vlef %%v18,48(%%r1,%3),2 \n\t" - "vlef %%v19,52(%%r1,%3),2 \n\t" - "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,60(%%r1,%3),3 \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v29,48(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,64(%%r1,%3),0 \n\t" - "vlef %%v21,68(%%r1,%3),0 \n\t" - "vlef %%v20,72(%%r1,%3),1 \n\t" - "vlef %%v21,76(%%r1,%3),1 \n\t" - "vlef %%v20,80(%%r1,%3),2 \n\t" - "vlef %%v21,84(%%r1,%3),2 \n\t" - "vlef %%v20,88(%%r1,%3),3 \n\t" - "vlef %%v21,92(%%r1,%3),3 \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,96(%%r1,%3),0 \n\t" - "vlef 
%%v23,100(%%r1,%3),0 \n\t" - "vlef %%v22,104(%%r1,%3),1 \n\t" - "vlef %%v23,108(%%r1,%3),1 \n\t" - "vlef %%v22,112(%%r1,%3),2 \n\t" - "vlef %%v23,116(%%r1,%3),2 \n\t" - "vlef %%v22,120(%%r1,%3),3 \n\t" - "vlef %%v23,124(%%r1,%3),3 \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -151,41 +151,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vlef %%v16,128(%%r1,%3),0 \n\t" - "vlef %%v17,132(%%r1,%3),0 \n\t" - "vlef %%v16,136(%%r1,%3),1 \n\t" - "vlef %%v17,140(%%r1,%3),1 \n\t" - "vlef %%v16,144(%%r1,%3),2 \n\t" - "vlef %%v17,148(%%r1,%3),2 \n\t" - "vlef %%v16,152(%%r1,%3),3 \n\t" - "vlef %%v17,156(%%r1,%3),3 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v28,144(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,160(%%r1,%3),0 \n\t" - "vlef %%v19,164(%%r1,%3),0 \n\t" - "vlef %%v18,168(%%r1,%3),1 \n\t" - "vlef %%v19,172(%%r1,%3),1 \n\t" - "vlef %%v18,176(%%r1,%3),2 \n\t" - "vlef %%v19,180(%%r1,%3),2 \n\t" - "vlef %%v18,184(%%r1,%3),3 \n\t" - "vlef %%v19,188(%%r1,%3),3 \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v29,176(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,192(%%r1,%3),0 \n\t" - "vlef %%v21,196(%%r1,%3),0 \n\t" - "vlef %%v20,200(%%r1,%3),1 \n\t" - "vlef %%v21,204(%%r1,%3),1 \n\t" - "vlef %%v20,208(%%r1,%3),2 \n\t" - "vlef %%v21,212(%%r1,%3),2 \n\t" - "vlef %%v20,216(%%r1,%3),3 \n\t" - "vlef %%v21,220(%%r1,%3),3 \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v30,208(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,224(%%r1,%3),0 \n\t" - "vlef %%v23,228(%%r1,%3),0 \n\t" - "vlef %%v22,232(%%r1,%3),1 \n\t" - "vlef %%v23,236(%%r1,%3),1 \n\t" - "vlef %%v22,240(%%r1,%3),2 \n\t" - "vlef %%v23,244(%%r1,%3),2 \n\t" - "vlef %%v22,248(%%r1,%3),3 \n\t" - "vlef %%v23,252(%%r1,%3),3 \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v31,240(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -258,7 +242,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "nop " :"=r"(iamax),"=m"(*amax) :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return iamax; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index faf5f9c65..05068b212 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -57,6 +57,22 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vleig %%v2,3,1 \n\t" "vrepig %%v3,16 \n\t" "vzero %%v4 \n\t" + "vleib %%v9,0,0 \n\t" + "vleib %%v9,1,1 \n\t" + "vleib %%v9,2,2 \n\t" + "vleib %%v9,3,3 \n\t" + "vleib %%v9,8,4 \n\t" + "vleib %%v9,9,5 \n\t" + "vleib %%v9,10,6 \n\t" + "vleib %%v9,11,7 \n\t" + "vleib %%v9,16,8 \n\t" + "vleib %%v9,17,9 \n\t" + "vleib %%v9,18,10 \n\t" + "vleib %%v9,19,11 \n\t" + "vleib %%v9,24,12 \n\t" + "vleib %%v9,25,13 \n\t" + "vleib %%v9,26,14 \n\t" + "vleib %%v9,27,15 \n\t" "vleif %%v24,0,0 \n\t" "vleif %%v24,1,1 \n\t" "vleif %%v24,2,2 \n\t" @@ -78,41 +94,25 @@ 
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" - "vlef %%v16,0(%%r1,%3),0 \n\t" - "vlef %%v17,4(%%r1,%3),0 \n\t" - "vlef %%v16,8(%%r1,%3),1 \n\t" - "vlef %%v17,12(%%r1,%3),1 \n\t" - "vlef %%v16,16(%%r1,%3),2 \n\t" - "vlef %%v17,20(%%r1,%3),2 \n\t" - "vlef %%v16,24(%%r1,%3),3 \n\t" - "vlef %%v17,28(%%r1,%3),3 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v28,16(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,32(%%r1,%3),0 \n\t" - "vlef %%v19,36(%%r1,%3),0 \n\t" - "vlef %%v18,40(%%r1,%3),1 \n\t" - "vlef %%v19,44(%%r1,%3),1 \n\t" - "vlef %%v18,48(%%r1,%3),2 \n\t" - "vlef %%v19,52(%%r1,%3),2 \n\t" - "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,60(%%r1,%3),3 \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v29,48(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,64(%%r1,%3),0 \n\t" - "vlef %%v21,68(%%r1,%3),0 \n\t" - "vlef %%v20,72(%%r1,%3),1 \n\t" - "vlef %%v21,76(%%r1,%3),1 \n\t" - "vlef %%v20,80(%%r1,%3),2 \n\t" - "vlef %%v21,84(%%r1,%3),2 \n\t" - "vlef %%v20,88(%%r1,%3),3 \n\t" - "vlef %%v21,92(%%r1,%3),3 \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,96(%%r1,%3),0 \n\t" - "vlef %%v23,100(%%r1,%3),0 \n\t" - "vlef %%v22,104(%%r1,%3),1 \n\t" - "vlef %%v23,108(%%r1,%3),1 \n\t" - "vlef %%v22,112(%%r1,%3),2 \n\t" - "vlef %%v23,116(%%r1,%3),2 \n\t" - "vlef %%v22,120(%%r1,%3),3 \n\t" - "vlef %%v23,124(%%r1,%3),3 \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -151,41 +151,25 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vlef %%v16,128(%%r1,%3),0 \n\t" - "vlef %%v17,132(%%r1,%3),0 \n\t" - "vlef %%v16,136(%%r1,%3),1 \n\t" - "vlef %%v17,140(%%r1,%3),1 \n\t" - "vlef %%v16,144(%%r1,%3),2 \n\t" - "vlef %%v17,148(%%r1,%3),2 \n\t" - "vlef %%v16,152(%%r1,%3),3 \n\t" - "vlef %%v17,156(%%r1,%3),3 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v28,144(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,160(%%r1,%3),0 \n\t" - "vlef %%v19,164(%%r1,%3),0 \n\t" - "vlef %%v18,168(%%r1,%3),1 \n\t" - "vlef %%v19,172(%%r1,%3),1 \n\t" - "vlef %%v18,176(%%r1,%3),2 \n\t" - "vlef %%v19,180(%%r1,%3),2 \n\t" - "vlef %%v18,184(%%r1,%3),3 \n\t" - "vlef %%v19,188(%%r1,%3),3 \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v29,176(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,192(%%r1,%3),0 \n\t" - "vlef %%v21,196(%%r1,%3),0 \n\t" - "vlef %%v20,200(%%r1,%3),1 \n\t" - "vlef %%v21,204(%%r1,%3),1 \n\t" - "vlef %%v20,208(%%r1,%3),2 \n\t" - "vlef %%v21,212(%%r1,%3),2 \n\t" - "vlef %%v20,216(%%r1,%3),3 \n\t" - "vlef %%v21,220(%%r1,%3),3 \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v30,208(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,224(%%r1,%3),0 \n\t" - "vlef %%v23,228(%%r1,%3),0 \n\t" - "vlef %%v22,232(%%r1,%3),1 \n\t" - "vlef %%v23,236(%%r1,%3),1 \n\t" - "vlef %%v22,240(%%r1,%3),2 \n\t" - "vlef %%v23,244(%%r1,%3),2 \n\t" - "vlef %%v22,248(%%r1,%3),3 \n\t" - "vlef %%v23,252(%%r1,%3),3 \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v31,240(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm 
%%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -258,7 +242,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "nop " :"=r"(iamin),"=m"(*amin) :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return iamin; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index b7214783f..cc6347127 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -132,7 +132,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) "ldr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amax; diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index d53fdb6b8..18610daea 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -132,7 +132,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) "ldr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amin; From 29416cb5a37b990052d019f66736af5263a81809 Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 19:11:11 +0200 Subject: [PATCH 426/935] [ZARCH] Add Z13 version for max/min functions --- kernel/zarch/KERNEL.Z13 | 12 +-- kernel/zarch/damax_z13.c | 204 ++++++++++++++++++++++++++++++++++++ kernel/zarch/damin_z13.c | 204 ++++++++++++++++++++++++++++++++++++ kernel/zarch/dmax_z13.c | 180 +++++++++++++++++++++++++++++++ kernel/zarch/dmin_z13.c | 180 +++++++++++++++++++++++++++++++ kernel/zarch/zamax_z13.c | 221 +++++++++++++++++++++++++++++++++++++++ kernel/zarch/zamin_z13.c | 221 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 1216 insertions(+), 6 deletions(-) create mode 100644 kernel/zarch/damax_z13.c create mode 100644 kernel/zarch/damin_z13.c create mode 100644 kernel/zarch/dmax_z13.c create mode 100644 kernel/zarch/dmin_z13.c create mode 100644 kernel/zarch/zamax_z13.c create mode 100644 kernel/zarch/zamin_z13.c diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index e5b974ab4..22c7e9703 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = damax.c +DAMAXKERNEL = damax_z13.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = zamax.c +ZAMAXKERNEL = zamax_z13.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = damin.c +DAMINKERNEL = damin_z13.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = zamin.c +ZAMINKERNEL = zamin_z13.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = dmax.c +DMAXKERNEL = dmax_z13.c SMINKERNEL = ../arm/min.c -DMINKERNEL = dmin.c +DMINKERNEL = dmin_z13.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c new file mode 100644 index 000000000..95b94ee4a --- /dev/null +++ b/kernel/zarch/damax_z13.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The 
OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + 
"vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = damax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c new file mode 100644 index 000000000..538690ee5 --- /dev/null +++ b/kernel/zarch/damin_z13.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = damin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + 
minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c new file mode 100644 index 000000000..83e7b02a8 --- /dev/null +++ b/kernel/zarch/dmax_z13.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT max; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return max; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c new file mode 100644 index 000000000..e64f90ee3 --- /dev/null +++ b/kernel/zarch/dmin_z13.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 
\n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c new file mode 100644 index 000000000..ae711c173 --- /dev/null +++ b/kernel/zarch/zamax_z13.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
+{
+ FLOAT amax;
+
+ __asm__ volatile (
+ "vleg %%v0,0(%2),0 \n\t"
+ "vleg %%v16,8(%2),0 \n\t"
+ "vleg %%v0,16(%2),1 \n\t"
+ "vleg %%v16,24(%2),1 \n\t"
+ "vflpdb %%v0,%%v0 \n\t"
+ "vflpdb %%v16,%%v16 \n\t"
+ "vfadb %%v0,%%v0,%%v16 \n\t"
+ "srlg %%r0,%1,4 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+
+ "vleg %%v16,0(%%r1,%2),0 \n\t"
+ "vleg %%v17,8(%%r1,%2),0 \n\t"
+ "vleg %%v16,16(%%r1,%2),1 \n\t"
+ "vleg %%v17,24(%%r1,%2),1 \n\t"
+ "vleg %%v18,32(%%r1,%2),0 \n\t"
+ "vleg %%v19,40(%%r1,%2),0 \n\t"
+ "vleg %%v18,48(%%r1,%2),1 \n\t"
+ "vleg %%v19,56(%%r1,%2),1 \n\t"
+ "vleg %%v20,64(%%r1,%2),0 \n\t"
+ "vleg %%v21,72(%%r1,%2),0 \n\t"
+ "vleg %%v20,80(%%r1,%2),1 \n\t"
+ "vleg %%v21,88(%%r1,%2),1 \n\t"
+ "vleg %%v22,96(%%r1,%2),0 \n\t"
+ "vleg %%v23,104(%%r1,%2),0 \n\t"
+ "vleg %%v22,112(%%r1,%2),1 \n\t"
+ "vleg %%v23,120(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v16,%%v17 \n\t"
+ "vfchdb %%v25,%%v18,%%v19 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v24,%%v25 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v26,%%v0 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "vleg %%v16,128(%%r1,%2),0 \n\t"
+ "vleg %%v17,136(%%r1,%2),0 \n\t"
+ "vleg %%v16,144(%%r1,%2),1 \n\t"
+ "vleg %%v17,152(%%r1,%2),1 \n\t"
+ "vleg %%v18,160(%%r1,%2),0 \n\t"
+ "vleg %%v19,168(%%r1,%2),0 \n\t"
+ "vleg %%v18,176(%%r1,%2),1 \n\t"
+ "vleg %%v19,184(%%r1,%2),1 \n\t"
+ "vleg %%v20,192(%%r1,%2),0 \n\t"
+ "vleg %%v21,200(%%r1,%2),0 \n\t"
+ "vleg %%v20,208(%%r1,%2),1 \n\t"
+ "vleg %%v21,216(%%r1,%2),1 \n\t"
+ "vleg %%v22,224(%%r1,%2),0 \n\t"
+ "vleg %%v23,232(%%r1,%2),0 \n\t"
+ "vleg %%v22,240(%%r1,%2),1 \n\t"
+ "vleg %%v23,248(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v16,%%v17 \n\t"
+ "vfchdb %%v25,%%v18,%%v19 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v24,%%v25 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v26,%%v0 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "agfi %%r1, 256 \n\t"
+ "brctg %%r0, 0b \n\t"
+
+ "vrepg %%v16,%%v0,1 \n\t"
+ "wfchdb %%v17,%%v0,%%v16 \n\t"
+ "vsel %%v0,%%v0,%%v16,%%v17 \n\t"
+ "ldr %0,%%f0 "
+ :"=f"(amax)
+ :"r"(n),"ZR"((const FLOAT (*)[n])x)
+ :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+ );
+
+ return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+ BLASLONG i = 0;
+ BLASLONG ix = 0;
+ FLOAT maxf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return
(maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } + else + { + maxf=CABS1(x,0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf=CABS1(x,0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); + } + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); + } + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c new file mode 100644 index 000000000..f82c57e81 --- /dev/null +++ b/kernel/zarch/zamin_z13.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
+{
+ FLOAT amin;
+
+ __asm__ volatile (
+ "vleg %%v0,0(%2),0 \n\t"
+ "vleg %%v16,8(%2),0 \n\t"
+ "vleg %%v0,16(%2),1 \n\t"
+ "vleg %%v16,24(%2),1 \n\t"
+ "vflpdb %%v0,%%v0 \n\t"
+ "vflpdb %%v16,%%v16 \n\t"
+ "vfadb %%v0,%%v0,%%v16 \n\t"
+ "srlg %%r0,%1,4 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+
+ "vleg %%v16,0(%%r1,%2),0 \n\t"
+ "vleg %%v17,8(%%r1,%2),0 \n\t"
+ "vleg %%v16,16(%%r1,%2),1 \n\t"
+ "vleg %%v17,24(%%r1,%2),1 \n\t"
+ "vleg %%v18,32(%%r1,%2),0 \n\t"
+ "vleg %%v19,40(%%r1,%2),0 \n\t"
+ "vleg %%v18,48(%%r1,%2),1 \n\t"
+ "vleg %%v19,56(%%r1,%2),1 \n\t"
+ "vleg %%v20,64(%%r1,%2),0 \n\t"
+ "vleg %%v21,72(%%r1,%2),0 \n\t"
+ "vleg %%v20,80(%%r1,%2),1 \n\t"
+ "vleg %%v21,88(%%r1,%2),1 \n\t"
+ "vleg %%v22,96(%%r1,%2),0 \n\t"
+ "vleg %%v23,104(%%r1,%2),0 \n\t"
+ "vleg %%v22,112(%%r1,%2),1 \n\t"
+ "vleg %%v23,120(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v17,%%v16 \n\t"
+ "vfchdb %%v25,%%v19,%%v18 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v25,%%v24 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v0,%%v26 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "vleg %%v16,128(%%r1,%2),0 \n\t"
+ "vleg %%v17,136(%%r1,%2),0 \n\t"
+ "vleg %%v16,144(%%r1,%2),1 \n\t"
+ "vleg %%v17,152(%%r1,%2),1 \n\t"
+ "vleg %%v18,160(%%r1,%2),0 \n\t"
+ "vleg %%v19,168(%%r1,%2),0 \n\t"
+ "vleg %%v18,176(%%r1,%2),1 \n\t"
+ "vleg %%v19,184(%%r1,%2),1 \n\t"
+ "vleg %%v20,192(%%r1,%2),0 \n\t"
+ "vleg %%v21,200(%%r1,%2),0 \n\t"
+ "vleg %%v20,208(%%r1,%2),1 \n\t"
+ "vleg %%v21,216(%%r1,%2),1 \n\t"
+ "vleg %%v22,224(%%r1,%2),0 \n\t"
+ "vleg %%v23,232(%%r1,%2),0 \n\t"
+ "vleg %%v22,240(%%r1,%2),1 \n\t"
+ "vleg %%v23,248(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v17,%%v16 \n\t"
+ "vfchdb %%v25,%%v19,%%v18 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v25,%%v24 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v0,%%v26 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "agfi %%r1, 256 \n\t"
+ "brctg %%r0, 0b \n\t"
+
+ "vrepg %%v16,%%v0,1 \n\t"
+ "wfchdb %%v17,%%v16,%%v0 \n\t"
+ "vsel %%v0,%%v0,%%v16,%%v17 \n\t"
+ "ldr %0,%%f0 "
+ :"=f"(amin)
+ :"r"(n),"ZR"((const FLOAT (*)[n])x)
+ :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+ );
+
+ return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+ BLASLONG i = 0;
+ BLASLONG ix = 0;
+ FLOAT minf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return
(minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } + else + { + minf=CABS1(x,0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (minf); + + } else { + + minf=CABS1(x,0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); + } + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); + } + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (minf); + } +} From 1249ee1fd0e62f5386b8b5dbce7b3d5fac785006 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:13:46 +0100 Subject: [PATCH 427/935] Add Z14 target from patch provided by aarnez in #991 --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 3d04a57cf..3a5a32234 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -94,3 +94,4 @@ THUNDERX2T99 9.System Z: ZARCH_GENERIC Z13 +Z14 From bdc73a49e0e3fe375fe2a015abebc962e29d72af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:14:37 +0100 Subject: [PATCH 428/935] Add parameters for Z14 from patch provided by aarnez in #991 --- param.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/param.h b/param.h index 15ea663a8..3cc400b54 100644 --- a/param.h +++ b/param.h @@ -2915,6 +2915,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(Z14) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 456 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 224 + +#define SGEMM_DEFAULT_Q 488 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 352 + +#define SGEMM_DEFAULT_R 8192 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 2048 + + +#define SYMV_P 16 +#endif + + #ifdef GENERIC From 72d3e7c9b49af5c13ff1e26d13fc3b35ffd92076 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:15:50 +0100 Subject: [PATCH 429/935] Add FORCE Z14 from patch provided by aarnez in #991 --- getarch.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/getarch.c b/getarch.c index d03ce6e98..242d08004 100644 --- a/getarch.c +++ b/getarch.c @@ -1085,6 +1085,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
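A quick way to see how the Z14 blocking factors above interact with the cache sizes that cpuid_zarch.c reports: the packed A panel used by DGEMM is P x Q elements, and with the values from param.h it occupies well under the 4 MiB L2. The small standalone C program below is only an illustrative cross-check of that arithmetic, not part of the patch series; its constants are copied from the two Z14 patches above.

#include <stdio.h>
#include <stddef.h>

int main(void) {
    /* Constants copied from the Z14 patches (param.h and cpuid_zarch.c). */
    const size_t l2_size = 4194304;            /* L2_SIZE for Z14        */
    const size_t p = 320, q = 384;             /* DGEMM_DEFAULT_P/Q      */
    size_t panel = p * q * sizeof(double);     /* packed A panel, bytes  */
    printf("DGEMM A panel: %zu bytes = %.1f%% of L2\n",
           panel, 100.0 * (double)panel / (double)l2_size);
    return 0;
}

Run, this prints a panel of 983040 bytes, roughly 23% of the 4 MiB L2, which leaves room for the packed B panel and the output tile alongside it.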
#define CORENAME "Z13" #endif +#ifdef FORCE_Z14 +#define FORCE +#define ARCHITECTURE "ZARCH" +#define SUBARCHITECTURE "Z14" +#define ARCHCONFIG "-DZ14 " \ + "-DDTB_DEFAULT_ENTRIES=64" +#define LIBNAME "z14" +#define CORENAME "Z14" +#endif + #ifndef FORCE #ifdef USER_TARGET From 4b512f84dd2b5861e6c860f68d05e56484efe7ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:16:44 +0100 Subject: [PATCH 430/935] Add cache sizes for Z14 from patch provided by aarnez in #991 --- cpuid_zarch.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 8ed40099b..896ed94f5 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -114,7 +114,14 @@ void get_cpuconfig(void) break; case CPU_Z14: printf("#define Z14\n"); + printf("#define L1_DATA_SIZE 131072\n"); + printf("#define L1_DATA_LINESIZE 256\n"); + printf("#define L1_DATA_ASSOCIATIVE 8\n"); + printf("#define L2_SIZE 4194304\n"); + printf("#define L2_LINESIZE 256\n"); + printf("#define L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); break; } } From 885a3c435092f5356ee4665b03d3709ce58a22f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:18:09 +0100 Subject: [PATCH 431/935] USE_TRMM on Z14 from patch provided by aarnez in #991 --- kernel/Makefile.L3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 9258f216d..eafcfb1b4 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -48,6 +48,10 @@ ifeq ($(ARCH), zarch) USE_TRMM = 1 endif +ifeq ($(CORE), Z14) +USE_TRMM = 1 +endif + From 265142edd5dc4c8d7e5e9f781468ac9c5bddb3ba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:21:40 +0100 Subject: [PATCH 432/935] Fix typo in the zarch min/max kernels from patch provided by aarnez in #991 --- kernel/zarch/damax.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index a3d63fe53..827467189 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -81,7 +81,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" - "vfmaxdb %%v0,%%v0,%%16,8 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 738ed8710..821f9eccc 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -81,7 +81,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) "vfmindb %%v16,%%v16,%%v17,8 \n\t" - "vfmindb %%v0,%%v0,%%16,8 \n\t" + "vfmindb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index aa8b932f9..5ec54c7bf 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -74,7 +74,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) "vfmaxdb %%v16,%%v16,%%v17,0 \n\t" - "vfmaxdb %%v0,%%v0,%%16,0 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 8ae5fe868..073289186 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -74,7 +74,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) "vfmindb %%v16,%%v16,%%v17,0 \n\t" - "vfmindb %%v0,%%v0,%%16,0 \n\t" + "vfmindb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b 
\n\t" diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index c8d831d06..b629d64c0 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -81,7 +81,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" - "vfmaxsb %%v0,%%v0,%%16,8 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index dd24c74d7..7ce6ee657 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -81,7 +81,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) "vfminsb %%v16,%%v16,%%v17,8 \n\t" - "vfminsb %%v0,%%v0,%%16,8 \n\t" + "vfminsb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 8a2b86dc1..e492d739c 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -74,7 +74,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) "vfmaxsb %%v16,%%v16,%%v17,0 \n\t" - "vfmaxsb %%v0,%%v0,%%16,0 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index b87ec0fe8..e7d83441b 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -74,7 +74,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) "vfminsb %%v16,%%v16,%%v17,0 \n\t" - "vfminsb %%v0,%%v0,%%16,0 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" From 877023e1e194faf5e42e2bb2d0771b52b52fed94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:22:26 +0100 Subject: [PATCH 433/935] Fix precision of zarch DSDOT from patch provided by aarnez in #991 --- kernel/zarch/dsdot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 800bb0d51..72950c9f4 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -132,7 +132,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n) { - dot += y[i] * x[i] ; + dot += (double) y[i] * (double) x[i] ; i++ ; } @@ -146,7 +146,8 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n1) { - dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + dot += (double) y[iy] * (double) x[ix]; + dot += (double) y[iy+inc_y] * (double) x[ix+inc_x]; ix += inc_x*2 ; iy += inc_y*2 ; i+=2 ; @@ -156,7 +157,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n) { - dot += y[iy] * x[ix] ; + dot += (double) y[iy] * (double) x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; From cce574c3e0763af7a5017f20fa36959c896fc4fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:24:55 +0100 Subject: [PATCH 434/935] Improve the z14 SGEMVT kernel from patch provided by aarnez in #991 --- sgemv_t_4.c | 811 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 811 insertions(+) create mode 100644 sgemv_t_4.c diff --git a/sgemv_t_4.c b/sgemv_t_4.c new file mode 100644 index 000000000..a3136723a --- /dev/null +++ b/sgemv_t_4.c @@ -0,0 +1,811 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmasb 
%%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v4,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v4 \n\t" + "vrepg %%v4,%%v0,1 \n\t" + "aebr %%f0,%%f4 \n\t" + "ste %%f0,0(%6) \n\t" + "veslg %%v4,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v4 \n\t" + "vrepg %%v4,%%v1,1 \n\t" + "aebr %%f1,%%f4 \n\t" + "ste %%f1,4(%6) \n\t" + "veslg %%v4,%%v2,32 \n\t" + "vfasb %%v2,%%v2,%%v4 \n\t" + "vrepg %%v4,%%v2,1 \n\t" + "aebr %%f2,%%f4 \n\t" + "ste %%f2,8(%6) \n\t" + "veslg %%v4,%%v3,32 \n\t" + "vfasb %%v3,%%v3,%%v4 \n\t" + "vrepg %%v4,%%v3,1 \n\t" + "aebr %%f3,%%f4 \n\t" + "ste %%f3,12(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmasb 
%%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v2,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vrepg %%v2,%%v0,1 \n\t" + "aebr %%f0,%%f2 \n\t" + "ste %%f0,0(%4) \n\t" + "veslg %%v2,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v2 \n\t" + "vrepg %%v2,%%v1,1 \n\t" + "aebr %%f1,%%f2 \n\t" + "ste %%f1,4(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "aebr %%f0,%%f1 \n\t" + "ste %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + dest[i] = *src; + src += inc_src; + } +} + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl 
%%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + "vl %%v25, 16(%%r1,%3) \n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" + "vst %%v25, 16(%%r1,%3) \n\t" + "vl %%v26, 32(%%r1,%3) \n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" + "vst %%v26, 32(%%r1,%3) \n\t" + "vl %%v27, 48(%%r1,%3) \n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" + "vst %%v27, 48(%%r1,%3) \n\t" + "vl %%v28, 64(%%r1,%3) \n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" + "vst %%v28, 64(%%r1,%3) \n\t" + "vl %%v29, 80(%%r1,%3) \n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" + "vst %%v29, 80(%%r1,%3) \n\t" + "vl %%v30, 96(%%r1,%3) \n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" + "vst %%v30, 96(%%r1,%3) \n\t" + "vl %%v31, 112(%%r1,%3) \n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" + "vst %%v31, 112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else + { + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? 
m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j Date: Fri, 1 Feb 2019 12:57:01 +0100 Subject: [PATCH 435/935] Delete misplaced file sgemv_t_4.c from #1993 , file should have gone into kernel/zarch --- sgemv_t_4.c | 811 ---------------------------------------------------- 1 file changed, 811 deletions(-) delete mode 100644 sgemv_t_4.c diff --git a/sgemv_t_4.c b/sgemv_t_4.c deleted file mode 100644 index a3136723a..000000000 --- a/sgemv_t_4.c +++ /dev/null @@ -1,811 +0,0 @@ -/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#define NBMAX 2048 - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" - 
- "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v4,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v4 \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "ste %%f0,0(%6) \n\t" - "veslg %%v4,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v4 \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "ste %%f1,4(%6) \n\t" - "veslg %%v4,%%v2,32 \n\t" - "vfasb %%v2,%%v2,%%v4 \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "ste %%f2,8(%6) \n\t" - "veslg %%v4,%%v3,32 \n\t" - "vfasb %%v3,%%v3,%%v4 \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "ste %%f3,12(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - 
"veslg %%v2,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "ste %%f0,0(%4) \n\t" - "veslg %%v2,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v2 \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "ste %%f1,4(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "ste %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } -} - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) \n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - 
"vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; - } - - if ( n2 & 2 ) - { - - sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; - - } - - if ( n2 & 1 ) - { - - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; - - } - a += NB; - x += NB * inc_x; - } - - if ( m3 == 0 ) return(0); - - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j Date: Fri, 1 Feb 2019 12:58:59 +0100 Subject: [PATCH 436/935] Fix incorrect 
sgemv results for IBM z14 part of PR #1993 that was inadvertently misplaced into the toplevel directory --- kernel/zarch/sgemv_t_4.c | 60 +++++++++++++++------------------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 5515d7bb7..a3136723a 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -158,32 +158,24 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "vrepf %%v4,%%v0,2 \n\t" - "aebr %%f0,%%f4 \n\t" - "vrepf %%v4,%%v0,3 \n\t" + "veslg %%v4,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v4 \n\t" + "vrepg %%v4,%%v0,1 \n\t" "aebr %%f0,%%f4 \n\t" "ste %%f0,0(%6) \n\t" - "vrepf %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "vrepf %%v4,%%v1,2 \n\t" - "aebr %%f1,%%f4 \n\t" - "vrepf %%v4,%%v1,3 \n\t" + "veslg %%v4,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v4 \n\t" + "vrepg %%v4,%%v1,1 \n\t" "aebr %%f1,%%f4 \n\t" "ste %%f1,4(%6) \n\t" - "vrepf %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "vrepf %%v4,%%v2,2 \n\t" - "aebr %%f2,%%f4 \n\t" - "vrepf %%v4,%%v2,3 \n\t" + "veslg %%v4,%%v2,32 \n\t" + "vfasb %%v2,%%v2,%%v4 \n\t" + "vrepg %%v4,%%v2,1 \n\t" "aebr %%f2,%%f4 \n\t" "ste %%f2,8(%6) \n\t" - "vrepf %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "vrepf %%v4,%%v3,2 \n\t" - "aebr %%f3,%%f4 \n\t" - "vrepf %%v4,%%v3,3 \n\t" + "veslg %%v4,%%v3,32 \n\t" + "vfasb %%v3,%%v3,%%v4 \n\t" + "vrepg %%v4,%%v3,1 \n\t" "aebr %%f3,%%f4 \n\t" "ste %%f3,12(%6) " : @@ -281,18 +273,14 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "vrepf %%v2,%%v0,2 \n\t" - "aebr %%f0,%%f2 \n\t" - "vrepf %%v2,%%v0,3 \n\t" + "veslg %%v2,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vrepg %%v2,%%v0,1 \n\t" "aebr %%f0,%%f2 \n\t" "ste %%f0,0(%4) \n\t" - "vrepf %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "vrepf %%v2,%%v1,2 \n\t" - "aebr %%f1,%%f2 \n\t" - "vrepf %%v2,%%v1,3 \n\t" + "veslg %%v2,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v2 \n\t" + "vrepg %%v2,%%v1,1 \n\t" "aebr %%f1,%%f2 \n\t" "ste %%f1,4(%4) " : @@ -349,7 +337,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "vl %%v31,112(%%r1,%1) \n\t" "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - + "agfi %%r1,128 \n\t" "brctg %%r0,0b \n\t" @@ -370,11 +358,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "vrepf %%v1,%%v0,3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepg %%v1,%%v0,1 \n\t" "aebr %%f0,%%f1 \n\t" "ste %%f0,0(%3) " : @@ -823,5 +809,3 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } - - From 4abc375a91d6a3bc97e180dca9f33750193ad281 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 1 Feb 2019 13:45:00 +0000 Subject: [PATCH 437/935] sgemv cgemv pairs --- kernel/power/KERNEL.POWER8 | 8 +- kernel/power/cgemv_n.c | 585 +++++++++++++++++++++++++++++++++++++ kernel/power/cgemv_t.c | 571 ++++++++++++++++++++++++++++++++++++ kernel/power/dgemv_t.c | 4 +- kernel/power/icamax.c | 81 ++++- kernel/power/sgemv_n.c | 465 +++++++++++++++++++++++++++++ kernel/power/sgemv_t.c | 480 ++++++++++++++++++++++++++++++ kernel/power/sgemv_t_8.c | 501 +++++++++++++++++++++++++++++++ kernel/power/zgemv_n_4.c | 22 +- kernel/power/zgemv_t_4.c | 6 +- utest/Makefile | 1 - 
11 files changed, 2691 insertions(+), 33 deletions(-) create mode 100644 kernel/power/cgemv_n.c create mode 100644 kernel/power/cgemv_t.c create mode 100644 kernel/power/sgemv_n.c create mode 100644 kernel/power/sgemv_t.c create mode 100644 kernel/power/sgemv_t_8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index cbcffb8fe..e6f69c7c4 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c ZSWAPKERNEL = zswap.c # -#SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c DGEMVNKERNEL = dgemv_n.c -#CGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # -#SGEMVTKERNEL = ../arm/gemv_t.c +SGEMVTKERNEL = sgemv_t.c DGEMVTKERNEL = dgemv_t.c -#CGEMVTKERNEL = ../arm/zgemv_t.c +CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c new file mode 100644 index 000000000..cb01e196e --- /dev/null +++ b/kernel/power/cgemv_n.c @@ -0,0 +1,585 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
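With KERNEL.POWER8 now routing SGEMV and CGEMV to the new POWER8 C kernels instead of the commented-out generic arm versions, the file that follows vectorizes the non-transposed complex update y += alpha * A * x. As a reading aid, a naive scalar reference of that operation for interleaved single-precision complex data, no-conjugation case only (an illustrative sketch, not part of the patch; cgemv_n_ref is our name):

#include <stdio.h>

/* Naive reference for y += alpha * A * x, column-major A, complex
   float stored as interleaved (re, im) pairs; no-conjugation case. */
static void cgemv_n_ref(int m, int n, float ar, float ai,
                        const float *a, int lda,
                        const float *x, float *y) {
    for (int j = 0; j < n; j++) {
        float xr = x[2 * j], xi = x[2 * j + 1];
        float tr = ar * xr - ai * xi;   /* t = alpha * x[j] */
        float ti = ar * xi + ai * xr;
        const float *col = a + 2 * j * lda;
        for (int i = 0; i < m; i++) {
            y[2 * i]     += col[2 * i] * tr - col[2 * i + 1] * ti;
            y[2 * i + 1] += col[2 * i] * ti + col[2 * i + 1] * tr;
        }
    }
}

int main(void) {
    float a[2] = {1, 2}, x[2] = {3, 4}, y[2] = {0, 0};
    cgemv_n_ref(1, 1, 1, 0, a, 1, x, y);
    printf("%g %+gi\n", y[0], y[1]);    /* (1+2i)(3+4i) = -5+10i */
    return 0;
}
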
+ *****************************************************************************/ + +#include +#include +#include "common.h" +#include +#define NBMAX 1024 + + +static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; + register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; + register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; + register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; + register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; + register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; + register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; + register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; + register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; +#endif + register __vector float *vy = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + register __vector float *vptr_a2 = (__vector float *) a2; + register __vector float *vptr_a3 = (__vector float *) a3; + BLASLONG i = 0; + for (;i< n / 2; i+=2) { + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float va0 = vptr_a0[i]; + register __vector float va1 = vptr_a1[i]; + register __vector float va2 = vptr_a2[i]; + register __vector float va3 = vptr_a3[i]; + register __vector float va0_1 = vptr_a0[i + 1]; + register __vector float va1_1 = vptr_a1[i + 1]; + register __vector float va2_1 = vptr_a2[i + 1]; + register __vector float va3_1 = vptr_a3[i + 1]; + + vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; + va0 = vec_perm(va0, va0,swap_mask); + va0_1 = vec_perm(va0_1, va0_1,swap_mask); + va1 = vec_perm(va1, va1,swap_mask); + va1_1 = vec_perm(va1_1, va1_1,swap_mask); + va2 = vec_perm(va2, va2,swap_mask); + va2_1 = vec_perm(va2_1, va2_1,swap_mask); + va3 = vec_perm(va3, va3,swap_mask); + va3_1 = vec_perm(va3_1, va3_1,swap_mask); + vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; + vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + } + +} + + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; +#else + register 
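Worth noting in the 4x4 body above: conjugation never needs a separate negate pass, because the signs are folded into the splatted x vectors (vx_r = {xr, xr, ...}, vx_i = {-xi, xi, ...}) and the vec_perm swap supplies (ai, ar) from each matrix element. A scalar model of one such fused step (illustrative only; cfma and cfloat are our names):

#include <stdio.h>

typedef struct { float re, im; } cfloat;

/* One step of the kernel's scheme: with vx_r = {xr, xr} and
   vx_i = {-xi, xi} pre-splatted, acc += a*vx_r + swap(a)*vx_i is a
   full complex multiply-accumulate with no explicit negation. */
static cfloat cfma(cfloat acc, cfloat a, cfloat x) {
    acc.re += a.re * x.re + a.im * -x.im;  /* lane 0: a*vx_r + swap(a)*vx_i */
    acc.im += a.im * x.re + a.re *  x.im;  /* lane 1 */
    return acc;
}

int main(void) {
    cfloat acc = {0, 0}, a = {2, 1}, x = {1, 3};
    acc = cfma(acc, a, x);
    printf("%g %+gi\n", acc.re, acc.im);   /* (2+1i)(1+3i) = -1+7i */
    return 0;
}
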
__vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vy = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + for (;i< n / 2; i+=2) { + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float va0 = vptr_a0[i]; + register __vector float va1 = vptr_a1[i]; + register __vector float va0_1 = vptr_a0[i + 1]; + register __vector float va1_1 = vptr_a1[i + 1]; + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vy = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + for (;i< n / 2; i+=2) { + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float va0 = vptr_a0[i]; + register __vector float va0_1 = vptr_a0[i + 1]; + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + } +} + + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i +static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector 
float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; + vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; + vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1]; + vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1]; + vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1]; + vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register 
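The signed horizontal sums above are where the complex arithmetic of the transposed kernel resolves: the "_p" accumulators hold (xr*ar, xi*ai, ...) lane pairs and the "_r" accumulators (xi*ar, xr*ai, ...), so real parts fall out with alternating signs while imaginary parts are a plain sum. A two-lane scalar model, no-conjugation case (illustrative only, not part of the patch):

#include <stdio.h>

/* One complex product via the kernel's two accumulators:
   p = (xr*ar, xi*ai) from vx * va, r = (xi*ar, xr*ai) from
   swap(vx) * va; the real part is a signed fold, the imaginary
   part an unsigned one. */
int main(void) {
    float x[2] = {4, 1}, a[2] = {2, 3};   /* x = 4+1i, a = 2+3i */
    float p[2] = {x[0] * a[0], x[1] * a[1]};
    float r[2] = {x[1] * a[0], x[0] * a[1]};
    float re = p[0] - p[1];               /* alternating signs */
    float im = r[0] + r[1];
    printf("%g %+gi\n", re, im);          /* (4+1i)(2+3i) = 5+14i */
    return 0;
}
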
__vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; + vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) ap; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 
1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] 
+= alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} + diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index 3974ed62d..b8589a131 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 8192 -#define PREFETCH 1 +#define NBMAX 1024 +//#define PREFETCH 1 #include #define HAVE_KERNEL4x8_ASM 1 diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index aa0531dc6..06fc5d8ad 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -36,9 +36,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif - /** * Find maximum index * Warning: requirements n>0 and n % 32 == 0 @@ -51,12 +76,16 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; BLASLONG i; +#if defined(USE_MASK_PERMUTATIONS) register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; temp0=vec_xor(temp0,temp0); temp1=temp1 <<1 ; //{16,16,16,16} register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} @@ -64,9 +93,11 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { register __vector float quadruple_values={0,0,0,0}; register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - for(; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + 
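On the icamax.c change above: when USE_MASK_PERMUTATIONS is not defined, the vmrgew/vmrgow inline asm separates the interleaved complex input into one vector of real parts and one of imaginary parts in two merges, at the price of a permuted lane order, which is why static_index0 becomes {2,0,3,1} on that path. A scalar model of the two merge operations (illustrative only, not part of the patch; the helper names are ours):

#include <stdio.h>

/* Scalar model of vmrgew/vmrgow as used by mvec_mergee/mvec_mergeo:
   the even word lanes of two vectors form one result, the odd lanes
   the other, so (re, im) pairs separate in two merges. */
static void merge_even(const float a[4], const float b[4], float r[4]) {
    r[0] = a[0]; r[1] = b[0]; r[2] = a[2]; r[3] = b[2];
}
static void merge_odd(const float a[4], const float b[4], float r[4]) {
    r[0] = a[1]; r[1] = b[1]; r[2] = a[3]; r[3] = b[3];
}

int main(void) {
    float a[4] = {0.0f, 0.5f, 1.0f, 1.5f};  /* r0 i0 r1 i1 */
    float b[4] = {2.0f, 2.5f, 3.0f, 3.5f};  /* r2 i2 r3 i3 */
    float re[4], im[4];
    merge_even(a, b, re);                   /* reals, permuted order  */
    merge_odd(a, b, im);                    /* imags, same order      */
    printf("%g %g %g %g\n", re[0], re[1], re[2], re[3]);
    return 0;
}
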
memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + 
temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c new file mode 100644 index 000000000..96434a13f --- /dev/null +++ b/kernel/power/sgemv_t.c @@ -0,0 +1,480 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
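sgemv_t.c below implements the transposed update y += alpha * A^T * x with vectorized 8/4/2/1-column kernels; each kernel accumulates per-column dot products in vector registers and reduces them at the end. A naive scalar reference of the operation (illustrative sketch, not part of the patch; sgemv_t_ref is our name):

#include <stdio.h>

/* Naive reference for y += alpha * A^T * x, column-major A:
   each y[j] gains alpha times the dot product of column j with x. */
static void sgemv_t_ref(int m, int n, float alpha,
                        const float *a, int lda,
                        const float *x, float *y) {
    for (int j = 0; j < n; j++) {
        float t = 0.0f;
        for (int i = 0; i < m; i++)
            t += a[i + j * lda] * x[i];
        y[j] += alpha * t;
    }
}

int main(void) {
    float a[4] = {1, 2, 3, 4};         /* 2x2, columns {1,2} and {3,4} */
    float x[2] = {1, 1}, y[2] = {0, 0};
    sgemv_t_ref(2, 2, 1.0f, a, 2, x, y);
    printf("%g %g\n", y[0], y[1]);     /* 3 7 */
    return 0;
}
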
+ *****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static 
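Before the 4x1 kernel and the CNAME driver below, a note on the blocking scheme they feed: rows are consumed in chunks of at most NBMAX so the (possibly copied) x slice stays cache-resident, and shrinking NB for the final partial block also terminates the while loop. A stand-alone model with NBMAX as in this file (example values; illustrative only, not part of the patch):

#include <stdio.h>

/* Model of the row blocking in CNAME below: m3 scalar-tail rows are
   peeled separately; the rest is covered by full NBMAX blocks plus
   one final partial block of m2 rows. */
#define NBMAX 2048
int main(void) {
    long m = 5003;
    long m3 = m & 3;                  /* scalar tail, handled later   */
    long m1 = m - m3;
    long m2 = (m & (NBMAX - 1)) - m3; /* size of the partial block    */
    long NB = NBMAX, total = 0;
    while (NB == NBMAX) {
        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;       /* m was an exact multiple      */
            NB = m2;                  /* final, smaller block         */
        }
        total += NB;                  /* "process NB rows" goes here  */
        printf("block of %ld rows\n", NB);
    }
    printf("blocked %ld + tail %ld = %ld rows\n", total, m3, m);
    return 0;
}
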
void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda 
<< 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c new file mode 100644 index 000000000..c9f928258 --- /dev/null +++ b/kernel/power/sgemv_t_8.c @@ -0,0 +1,501 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" +#include +#define NBMAX 2048 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + 
temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + 
sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; 
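A note on the remainder logic running through this stretch: unlike sgemv_t.c, this 8-based variant peels the row remainder m3 = m & 7 by its bits, a 4-row block (above), then 2, then 1 (below), so every leftover count from 0 to 7 is covered exactly once. A tiny stand-alone check of that decomposition (illustrative only, not part of the patch):

#include <assert.h>
#include <stdio.h>

/* The bit-peeled tail: 4 rows if bit 2 is set, 2 if bit 1, 1 if
   bit 0 — together they account for any remainder 0..7 exactly. */
static int tail_rows(int m3) {
    int done = 0;
    if (m3 & 4) done += 4;
    if (m3 & 2) done += 2;
    if (m3 & 1) done += 1;
    return done;
}

int main(void) {
    for (int m3 = 0; m3 < 8; m3++)
        assert(tail_rows(m3) == m3);
    printf("all remainders covered\n");
    return 0;
}
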
+ y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 8b250a7f1..167b0a158 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -389,20 +389,14 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { register __vector double va0_2 = vptr_a0[i + 2]; register __vector double va0_3 = vptr_a0[i + 3]; - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_xxpermdi(va0, va0, 2); - va0_1 = vec_xxpermdi(va0_1, va0_1, 2); - va0_2 = vec_xxpermdi(va0_2, va0_2, 2); - va0_3 = vec_xxpermdi(va0_3, va0_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; + register __vector double va0x = vec_xxpermdi(va0, va0, 2); + register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2); + register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2); + register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + vy_2 += va0_2*vx0_r + va0x_2*vx0_i; + vy_3 += va0_3*vx0_r + va0x_3*vx0_i; vy[i] = vy_0; vy[i + 1] = vy_1; diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 572206494..20a0812dd 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -59,11 +59,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA i = 0; n = n << 1; while (i < n) { -// __builtin_prefetch(&x[i]); -// __builtin_prefetch(&a0[i]); -// __builtin_prefetch(&a1[i]); -// __builtin_prefetch(&a2[i]); -// __builtin_prefetch(&a3[i]); + register __vector double vx_0 = *(__vector double*) (&x[i]); register 
__vector double vx_1 = *(__vector double*) (&x[i + 2]); register __vector double vx_2 = *(__vector double*) (&x[i + 4]); diff --git a/utest/Makefile b/utest/Makefile index e40b3c6db..550a65569 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -37,4 +37,3 @@ clean: -rm -f *.o $(UTESTBIN) libs: - From cd9ea45463b46d603ae4f8c8af033331c07abbc2 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 4 Feb 2019 06:57:11 +0000 Subject: [PATCH 438/935] NBMAX=4096 for gemvn, added sgemvn 8x8 for future --- kernel/power/sgemv_n.c | 2 +- kernel/power/sgemv_n_8.c | 507 +++++++++++++++++++++++++++++++++++++++ kernel/power/sgemv_t_8.c | 2 +- 3 files changed, 509 insertions(+), 2 deletions(-) create mode 100644 kernel/power/sgemv_n_8.c diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 56f08c2bf..9704757fe 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 2048 +#define NBMAX 4096 static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c new file mode 100644 index 000000000..d05b08f4e --- /dev/null +++ b/kernel/power/sgemv_n_8.c @@ -0,0 +1,507 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] ; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float 
va3_1=va3[i] ;
+        register __vector float va3_2=va3[i+1] ;
+        vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ;
+        vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ;
+        v_y[i] =vy_1;
+        v_y[i+1] =vy_2;
+    }
+
+}
+
+static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+    BLASLONG i;
+    FLOAT x0,x1;
+    x0 = x[0] * *alpha;
+    x1 = x[1] * *alpha;
+    __vector float v_x0 = {x0,x0,x0,x0};
+    __vector float v_x1 = {x1,x1,x1,x1};
+    __vector float* v_y =(__vector float*)y;
+    __vector float* va0 = (__vector float*)ap[0];
+    __vector float* va1 = (__vector float*)ap[1];
+
+    for ( i=0; i< n/4; i+=2 )
+    {
+        v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
+        v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ;
+    }
+
+}
+
+
+static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+    BLASLONG i;
+    FLOAT x0 ;
+    x0 = x[0] * *alpha;
+    __vector float v_x0 = {x0,x0,x0,x0};
+    __vector float* v_y =(__vector float*)y;
+    __vector float* va0 = (__vector float*)ap;
+
+    for ( i=0; i< n/4; i+=2 )
+    {
+        v_y[i] += v_x0 * va0[i] ;
+        v_y[i+1] += v_x0 * va0[i+1] ;
+    }
+
+}
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+    BLASLONG i;
+
+    /* accumulate the contiguous buffer back into y with stride inc_dest */
+    for ( i=0; i<n; i++ )
+    {
+        *dest += *src;
+        src++;
+        dest += inc_dest;
+    }
+    return;
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i;
+    FLOAT *a_ptr;
+    FLOAT *x_ptr;
+    FLOAT *y_ptr;
+    FLOAT *ap[4];
+    BLASLONG n1;
+    BLASLONG m1;
+    BLASLONG m2;
+    BLASLONG m3;
+    BLASLONG n2;
+    BLASLONG lda4 = lda << 2;
+    BLASLONG lda8 = lda << 3;
+    FLOAT xbuffer[8] __attribute__ ((aligned(16)));
+    FLOAT *ybuffer;
+
+    if ( m < 1 ) return(0);
+    if ( n < 1 ) return(0);
+
+    ybuffer = buffer;
+
+    if ( inc_x == 1 )
+    {
+        n1 = n >> 3 ;
+        n2 = n & 7 ;
+    }
+    else
+    {
+        n1 = n >> 2 ;
+        n2 = n & 3 ;
+
+    }
+
+    m3 = m & 7 ;
+    m1 = m - m3;
+    m2 = (m & (NBMAX-1)) - m3 ;
+
+
+    y_ptr = y;
+
+    BLASLONG NB = NBMAX;
+
+    while ( NB == NBMAX )
+    {
+
+        m1 -= NB;
+        if ( m1 < 0)
+        {
+            if ( m2 == 0 ) break;
+            NB = m2;
+        }
+
+        a_ptr = a;
+        x_ptr = x;
+
+        ap[0] = a_ptr;
+        ap[1] = a_ptr + lda;
+        ap[2] = ap[1] + lda;
+        ap[3] = ap[2] + lda;
+
+        if ( inc_y != 1 )
+            memset(ybuffer,0,NB*4);
+        else
+            ybuffer = y_ptr;
+
+        if ( inc_x == 1 )
+        {
+
+
+            for( i = 0; i < n1 ; i++)
+            {
+                sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
+                ap[0] += lda8;
+                ap[1] += lda8;
+                ap[2] += lda8;
+                ap[3] += lda8;
+                a_ptr += lda8;
+                x_ptr += 8;
+            }
+
+
+            if ( n2 & 4 )
+            {
+                sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha);
+                ap[0] += lda4;
+                ap[1] += lda4;
+                ap[2] += lda4;
+                ap[3] += lda4;
+                a_ptr += lda4;
+                x_ptr += 4;
+            }
+
+            if ( n2 & 2 )
+            {
+                sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha);
+                a_ptr += lda*2;
+                x_ptr += 2;
+            }
+
+
+            if ( n2 & 1 )
+            {
+                sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
+                a_ptr += lda;
+                x_ptr += 1;
+            }
+
+
+        }
+        else
+        {
+
+            for( i = 0; i < n1 ; i++)
+            {
+                xbuffer[0] = x_ptr[0];
+                x_ptr += inc_x;
+                xbuffer[1] = x_ptr[0];
+                x_ptr += inc_x;
+                xbuffer[2] = x_ptr[0];
+                x_ptr += inc_x;
+                xbuffer[3] = x_ptr[0];
+                x_ptr += inc_x;
+                sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha);
+                ap[0] += lda4;
+                ap[1] += lda4;
+                ap[2] += lda4;
+                ap[3] += lda4;
+                a_ptr += lda4;
+            }
+
+            for( i = 0; i < n2 ; i++)
+            {
+                xbuffer[0] = x_ptr[0];
+                x_ptr += inc_x;
+                sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
+                a_ptr += lda;
+
+            }
+
+        }
+
+        a += NB;
+        if ( inc_y != 1 )
+        {
+            add_y(NB,ybuffer,y_ptr,inc_y);
+            y_ptr += NB * inc_y;
+        }
+        else
+            y_ptr += NB ;
+
+    }
+
+
+    if ( m3 & 4 )
+    {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp0 = 0.0;
+        FLOAT temp1 = 0.0;
+        FLOAT temp2 = 0.0;
+        FLOAT temp3 = 0.0;
+        if ( lda == 4 && inc_x ==1 )
+        {
+
+            for( i = 0; i < ( n & -4 ); i+=4 )
+            {
+
+                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1];
+                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1];
+                temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1];
+
+                temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3];
+                temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3];
+                temp2 += a_ptr[10] * x_ptr[2] + 
a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index c9f928258..e426f36c3 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#define NBMAX 2048 +#define NBMAX 4096 #include From 498ac98581accf80085c020874ad6a9513f95996 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 4 Feb 2019 15:41:56 +0000 Subject: [PATCH 439/935] Note for unused kernels --- kernel/power/sgemv_n_8.c | 6 ++++++ kernel/power/sgemv_t_8.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index d05b08f4e..9bc93ced6 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -26,6 +26,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/****Note*** +UnUsed kernel +This kernel works. 
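(It computes y += alpha * A * x for single precision, sweeping the matrix
+eight columns at a time. To experiment with it, one would presumably point
+SGEMVNKERNEL at this file in the matching kernel/power/KERNEL.* file.)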
But it was not competitive enough to be added to production.
+It could be tested further in the future, or could provide a bare-bones
+base for switching to inline assembly.
+*/
+
 #include "common.h"
 #define NBMAX 4096
diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c
index e426f36c3..5e9cd63ac 100644
--- a/kernel/power/sgemv_t_8.c
+++ b/kernel/power/sgemv_t_8.c
@@ -25,6 +25,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
+
+/****Note***
+Unused kernel
+This kernel works, but it was not competitive enough to be added to production.
+It could be tested further in the future, or could be used as a base for switching to inline assembly.
+*/
+
 #include "common.h"
 #include
 #define NBMAX 4096
From 81daf6bc380c22bcc7ce228952e5435bc79bb0ce Mon Sep 17 00:00:00 2001
From: maamountki
Date: Tue, 5 Feb 2019 07:30:38 +0200
Subject: [PATCH 440/935] [ZARCH] Format source code, Fix constraints

---
 kernel/zarch/camax.c     |  370 +++++-----
 kernel/zarch/camin.c     |  370 +++++-----
 kernel/zarch/casum.c     |  236 +++----
 kernel/zarch/caxpy.c     |  232 +++----
 kernel/zarch/ccopy.c     |  102 ++-
 kernel/zarch/cdot.c      |  254 ++++---
 kernel/zarch/cgemv_n_4.c | 1263 +++++++++++++++++-----------------
 kernel/zarch/cgemv_t_4.c | 1179 ++++++++++++++++----------------
 kernel/zarch/crot.c      |  413 ++++++-----
 kernel/zarch/cscal.c     |  684 +++++++++----------
 kernel/zarch/cswap.c     |  263 ++++---
 kernel/zarch/damax.c     |  220 +++---
 kernel/zarch/damax_z13.c |  292 ++++----
 kernel/zarch/damin.c     |  220 +++---
 kernel/zarch/damin_z13.c |  292 ++++----
 kernel/zarch/dasum.c     |  248 ++++---
 kernel/zarch/daxpy.c     |  253 ++++---
 kernel/zarch/dcopy.c     |   76 +--
 kernel/zarch/ddot.c      |  196 +++---
 kernel/zarch/dgemv_n_4.c | 1200 +++++++++++++++-----------------
 kernel/zarch/dgemv_t_4.c | 1397 ++++++++++++++++++--------------------
 kernel/zarch/dmax.c      |  214 +++---
 kernel/zarch/dmax_z13.c  |  252 ++++---
 kernel/zarch/dmin.c      |  214 +++---
 kernel/zarch/dmin_z13.c  |  252 ++++---
 kernel/zarch/drot.c      |  381 +++++------
 kernel/zarch/dscal.c     |  278 ++++----
 kernel/zarch/dsdot.c     |  246 +++----
 kernel/zarch/dswap.c     |  228 +++----
 kernel/zarch/icamax.c    |  515 +++++++-------
 kernel/zarch/icamin.c    |  515 +++++++-------
 kernel/zarch/idamax.c    |  411 ++++++-----
 kernel/zarch/idamin.c    |  411 ++++++-----
 kernel/zarch/idmax.c     |  385 +++++------
 kernel/zarch/idmin.c     |  385 +++++------
 kernel/zarch/isamax.c    |  496 +++++++-------
 kernel/zarch/isamin.c    |  496 +++++++-------
 kernel/zarch/ismax.c     |  458 ++++++-------
 kernel/zarch/ismin.c     |  458 ++++++-------
 kernel/zarch/izamax.c    |  409 ++++++-----
 kernel/zarch/izamin.c    |  409 ++++++-----
 kernel/zarch/samax.c     |  225 +++---
 kernel/zarch/samin.c     |  225 +++---
 kernel/zarch/sasum.c     |  252 ++++---
 kernel/zarch/saxpy.c     |  253 ++++---
 kernel/zarch/scopy.c     |   76 +--
 kernel/zarch/sdot.c      |  188 ++---
 kernel/zarch/sgemv_n_4.c | 1157 +++++++++++++++----------------
 kernel/zarch/sgemv_t_4.c | 1380 ++++++++++++++++++-------------------
 kernel/zarch/smax.c      |  219 +++---
 kernel/zarch/smin.c      |  219 +++---
 kernel/zarch/srot.c      |  381 +++++------
 kernel/zarch/sscal.c     |  268 ++++----
 kernel/zarch/sswap.c     |  230 +++----
 kernel/zarch/zamax.c     |  333 +++++----
 kernel/zarch/zamax_z13.c |  352 +++++-----
 kernel/zarch/zamin.c     |  317 ++++-----
 kernel/zarch/zamin_z13.c |  336 +++++----
 kernel/zarch/zasum.c     |  232 +++----
 kernel/zarch/zaxpy.c     |  232 +++----
 kernel/zarch/zcopy.c     |  102 ++-
 kernel/zarch/zdot.c      |  246 ++++---
 kernel/zarch/zgemv_n_4.c | 
1147 +++++++++++++++---------------- kernel/zarch/zgemv_t_4.c | 1099 +++++++++++++++--------------- kernel/zarch/zrot.c | 413 ++++++----- kernel/zarch/zscal.c | 676 +++++++++--------- kernel/zarch/zswap.c | 263 ++++--- 67 files changed, 13393 insertions(+), 14601 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 2e9648640..40a9903e9 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vlef %%v0,0(%2),0 \n\t" - "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),1 \n\t" - "vlef %%v16,12(%2),1 \n\t" - "vlef %%v0,16(%2),2 \n\t" - "vlef %%v16,20(%2),2 \n\t" - "vlef %%v0,24(%2),3 \n\t" - "vlef %%v16,28(%2),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v16,%%v16 \n\t" - "vfasb %%v0,%%v0,%%v16 \n\t" - "vleib %%v1,0,0 \n\t" - "vleib %%v1,1,1 \n\t" - "vleib %%v1,2,2 \n\t" - "vleib %%v1,3,3 \n\t" - "vleib %%v1,8,4 \n\t" - "vleib %%v1,9,5 \n\t" - "vleib %%v1,10,6 \n\t" - "vleib %%v1,11,7 \n\t" - "vleib %%v1,16,8 \n\t" - "vleib %%v1,17,9 \n\t" - "vleib %%v1,18,10 \n\t" - "vleib %%v1,19,11 \n\t" - "vleib %%v1,24,12 \n\t" - "vleib %%v1,25,13 \n\t" - "vleib %%v1,26,14 \n\t" - "vleib %%v1,27,15 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v2,16(%%r1,%2) \n\t" - "vpkg %%v17,%%v16,%%v2 \n\t" - "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v2,48(%%r1,%2) \n\t" - "vpkg %%v19,%%v18,%%v2 \n\t" - "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v2,80(%%r1,%2) \n\t" - "vpkg %%v21,%%v20,%%v2 \n\t" - "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v2,112(%%r1,%2) \n\t" - "vpkg %%v23,%%v22,%%v2 \n\t" - "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v2,144(%%r1,%2) \n\t" - "vpkg %%v25,%%v24,%%v2 \n\t" - "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v2,176(%%r1,%2) \n\t" - "vpkg %%v27,%%v26,%%v2 \n\t" - "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v2,208(%%r1,%2) \n\t" - "vpkg %%v29,%%v28,%%v2 \n\t" - "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v2,240(%%r1,%2) \n\t" - "vpkg %%v31,%%v30,%%v2 \n\t" - "vperm %%v30,%%v30,%%v2,%%v1 \n\t" - - "vflpsb %%v16,%%v16 \n\t" - "vflpsb %%v17,%%v17 \n\t" - "vflpsb %%v18,%%v18 \n\t" - "vflpsb %%v19,%%v19 \n\t" - "vflpsb %%v20,%%v20 \n\t" - "vflpsb %%v21,%%v21 \n\t" - "vflpsb %%v22,%%v22 \n\t" - "vflpsb %%v23,%%v23 \n\t" - "vflpsb %%v24,%%v24 \n\t" - "vflpsb %%v25,%%v25 \n\t" - "vflpsb %%v26,%%v26 \n\t" - "vflpsb %%v27,%%v27 \n\t" - "vflpsb %%v28,%%v28 \n\t" - "vflpsb %%v29,%%v29 \n\t" - "vflpsb %%v30,%%v30 \n\t" - "vflpsb %%v31,%%v31 \n\t" - - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v18,%%v18,%%v19 \n\t" - "vfasb %%v20,%%v20,%%v21 \n\t" - "vfasb %%v22,%%v22,%%v23 \n\t" - "vfasb 
%%v24,%%v24,%%v25 \n\t" - "vfasb %%v26,%%v26,%%v27 \n\t" - "vfasb %%v28,%%v28,%%v29 \n\t" - "vfasb %%v30,%%v30,%%v31 \n\t" - - "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" - "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" - "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" - "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" - - "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" - "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" - - "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" - - "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) + +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb 
%%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - maxf = camax_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + maxf = camax_kernel_32(n1, x); + ix = n1 * 2; + i = n1; } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (maxf); + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index aec59058e..842635afc 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vlef %%v0,0(%2),0 \n\t" - "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),1 \n\t" - "vlef %%v16,12(%2),1 \n\t" - "vlef %%v0,16(%2),2 \n\t" - "vlef %%v16,20(%2),2 \n\t" - "vlef %%v0,24(%2),3 \n\t" - "vlef %%v16,28(%2),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v16,%%v16 \n\t" - "vfasb %%v0,%%v0,%%v16 \n\t" - "vleib %%v1,0,0 \n\t" - "vleib %%v1,1,1 \n\t" - "vleib %%v1,2,2 \n\t" - "vleib %%v1,3,3 \n\t" - "vleib %%v1,8,4 \n\t" - "vleib %%v1,9,5 \n\t" - "vleib %%v1,10,6 \n\t" - "vleib %%v1,11,7 \n\t" - "vleib %%v1,16,8 \n\t" - "vleib %%v1,17,9 \n\t" - "vleib %%v1,18,10 \n\t" - "vleib %%v1,19,11 \n\t" - "vleib %%v1,24,12 \n\t" - "vleib %%v1,25,13 \n\t" - "vleib %%v1,26,14 \n\t" - "vleib %%v1,27,15 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v2,16(%%r1,%2) \n\t" - "vpkg %%v17,%%v16,%%v2 \n\t" - "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v2,48(%%r1,%2) \n\t" - "vpkg %%v19,%%v18,%%v2 \n\t" - "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v2,80(%%r1,%2) \n\t" - "vpkg %%v21,%%v20,%%v2 \n\t" - "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v2,112(%%r1,%2) \n\t" - "vpkg %%v23,%%v22,%%v2 \n\t" - "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v2,144(%%r1,%2) \n\t" - "vpkg %%v25,%%v24,%%v2 \n\t" - "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v2,176(%%r1,%2) \n\t" - "vpkg %%v27,%%v26,%%v2 \n\t" - "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v2,208(%%r1,%2) \n\t" - "vpkg %%v29,%%v28,%%v2 \n\t" - "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v2,240(%%r1,%2) \n\t" - "vpkg %%v31,%%v30,%%v2 \n\t" - "vperm %%v30,%%v30,%%v2,%%v1 \n\t" - - "vflpsb %%v16,%%v16 \n\t" - "vflpsb %%v17,%%v17 \n\t" - "vflpsb %%v18,%%v18 \n\t" - "vflpsb %%v19,%%v19 \n\t" - "vflpsb %%v20,%%v20 \n\t" - "vflpsb %%v21,%%v21 \n\t" - "vflpsb %%v22,%%v22 \n\t" - "vflpsb %%v23,%%v23 \n\t" - "vflpsb %%v24,%%v24 \n\t" - "vflpsb %%v25,%%v25 \n\t" - "vflpsb %%v26,%%v26 \n\t" - "vflpsb %%v27,%%v27 \n\t" - "vflpsb %%v28,%%v28 \n\t" - "vflpsb %%v29,%%v29 \n\t" - "vflpsb %%v30,%%v30 \n\t" - "vflpsb %%v31,%%v31 \n\t" - - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v18,%%v18,%%v19 \n\t" - "vfasb %%v20,%%v20,%%v21 \n\t" - "vfasb %%v22,%%v22,%%v23 \n\t" - "vfasb %%v24,%%v24,%%v25 \n\t" - "vfasb %%v26,%%v26,%%v27 \n\t" - "vfasb %%v28,%%v28,%%v29 \n\t" - "vfasb %%v30,%%v30,%%v31 \n\t" - - "vfminsb %%v16,%%v16,%%v24,0 \n\t" - "vfminsb %%v18,%%v18,%%v26,0 \n\t" - "vfminsb %%v20,%%v20,%%v28,0 \n\t" - "vfminsb %%v22,%%v22,%%v30,0 \n\t" - - "vfminsb %%v16,%%v16,%%v20,0 \n\t" - "vfminsb %%v18,%%v18,%%v22,0 \n\t" - - "vfminsb %%v16,%%v16,%%v18,0 \n\t" - - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; +#define CABS1(x,i) (fabsf(x[i]) + 
fabsf(x[i + 1])) + +static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - 
if (n1 > 0) { - - minf = camin_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + minf = camin_kernel_32(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index f4ebc21bd..f59e5a20b 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,140 +28,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vfasb %%v0,%%v0,%%v3 \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "ler %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 
224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if ( inc_x == 1 ) - { + if (n <= 0 || inc_x <= 0) + return (sumf); - n1 = n & -32; - if ( n1 > 0 ) - { + if (inc_x == 1) { - sumf = casum_kernel_32(n1, x); - i=n1; - ip=2*n1; - } + n1 = n & -32; + if (n1 > 0) { - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = casum_kernel_32(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index fe5568cc8..d86342bd0 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,148 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "vlrepf %%v0,0(%3) \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v1,4(%3),2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,4(%3),1 \n\t" - "vlef %%v1,4(%3),3 \n\t" -#else - "vlef %%v0,0(%3),1 \n\t" - "vlef %%v0,0(%3),3 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v0,0(%3),2 \n\t" - "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v0,0(%[alpha])\n\t" + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" +#else + "vlef %%v0,0(%[alpha]),1\n\t" + "vlef %%v0,0(%[alpha]),3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,0(%[alpha]),0\n\t" + "vlef %%v0,0(%[alpha]),2\n\t" + "vlrepf %%v1,4(%[alpha])\n\t" #endif - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - - "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,80(%%r1,%1) \n\t" - "vl %%v18,96(%%r1,%1) \n\t" - "vl %%v19,112(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - - "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,64(%%r1,%2) \n\t" - "vst %%v29,80(%%r1,%2) \n\t" - "vst %%v30,96(%%r1,%2) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl 
%%v23,112(%%r1,%[y])\n\t"
+          "verllg %%v24,%%v8,32\n\t"
+          "verllg %%v25,%%v9,32\n\t"
+          "verllg %%v26,%%v10,32\n\t"
+          "verllg %%v27,%%v11,32\n\t"
+          "verllg %%v28,%%v16,32\n\t"
+          "verllg %%v29,%%v17,32\n\t"
+          "verllg %%v30,%%v18,32\n\t"
+          "verllg %%v31,%%v19,32\n\t"
+          "vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
+          "vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
+          "vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
+          "vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
+          "vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
+          "vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
+          "vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
+          "vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
+          "vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
+          "vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
+          "vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
+          "vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
+          "vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
+          "vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
+          "vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
+          "vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
+          "vst %%v8,0(%%r1,%[y])\n\t"
+          "vst %%v9,16(%%r1,%[y])\n\t"
+          "vst %%v10,32(%%r1,%[y])\n\t"
+          "vst %%v11,48(%%r1,%[y])\n\t"
+          "vst %%v16,64(%%r1,%[y])\n\t"
+          "vst %%v17,80(%%r1,%[y])\n\t"
+          "vst %%v18,96(%%r1,%[y])\n\t"
+          "vst %%v19,112(%%r1,%[y])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
+            "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
+            "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
 }
 
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
-    BLASLONG i = 0;
-    BLASLONG ix = 0, iy = 0;
-    FLOAT da[2] __attribute__ ((aligned(16)));
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+          BLASLONG dummy2) {
+  BLASLONG i = 0;
+  BLASLONG ix = 0, iy = 0;
+  FLOAT da[2] __attribute__ ((aligned(16)));
 
-    if (n <= 0) return (0);
+  if (n <= 0)
+    return (0);
 
-    if ((inc_x == 1) && (inc_y == 1)) {
+  if ((inc_x == 1) && (inc_y == 1)) {
 
-        BLASLONG n1 = n & -16;
+    BLASLONG n1 = n & -16;
 
-        if (n1) {
-            da[0] = da_r;
-            da[1] = da_i;
-            caxpy_kernel_16(n1, x, y, da);
-            ix = 2 * n1;
-        }
-        i = n1;
-        while (i < n) {
+    if (n1) {
+      da[0] = da_r;
+      da[1] = da_i;
+      caxpy_kernel_16(n1, x, y, da);
+      ix = 2 * n1;
+    }
+    i = n1;
+    while (i < n) {
 #if !defined(CONJ)
-            y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
-            y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+      y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
+      y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
 #else
-            y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
-            y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+      y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
+      y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
 #endif
-            i++;
-            ix += 2;
-
-        }
-        return (0);
-
+      i++;
+      ix += 2;
     }
+    return (0);
 
-    inc_x *= 2;
-    inc_y *= 2;
+  }
 
-    while (i < n) {
+  inc_x *= 2;
+  inc_y *= 2;
+
+  while (i < n) {
 #if !defined(CONJ)
-        y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
-        y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+    y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
+    y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
 #else
-        y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
-        y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+    y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
+    y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
 #endif
-        ix += inc_x;
-        iy += inc_y;
-        i++;
+    ix += inc_x;
+    iy += inc_y;
+    i++;
 
-    }
-    return (0);
+  }
+  return (0);
 }
-
-
diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c
index fc0b8d648..1b93a812e 100644
--- a/kernel/zarch/ccopy.c
+++ b/kernel/zarch/ccopy.c
@@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,5 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","r2" - ); +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n * 2]) x) + : "cc"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + if (n <= 0) + return (0); - if ( (inc_x == 1) && (inc_y == 1 )) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - ccopy_kernel_32(n1, x, y); - i=n1; - ix=n1*2; - iy=n1*2; - } - - while(i < n) - { - y[iy] = x[iy] ; - y[iy+1] = x[ix+1] ; - ix+=2; - iy+=2; - i++ ; - - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + ccopy_kernel_32(n1, x, y); + i = n1; + ix = n1 * 2; + iy = n1 * 2; + } + while (i < n) { + y[iy] = x[iy]; + y[iy + 1] = x[ix + 1]; + ix += 2; + iy += 2; + i++; } - else - { - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + } else { - while(i < n) - { - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; - } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += inc_x2; + iy += inc_y2; + i++; } - - return(0); + + } + + return (0); } diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index 3eda2979b..64d81ae5c 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,156 +27,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "vzero %%v28 \n\t" - "vzero %%v29 \n\t" - "vzero %%v30 \n\t" - "vzero %%v31 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - "verllg %%v22,%%v18,32 \n\t" - "verllg %%v23,%%v19,32 \n\t" - - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" - - "vl %%v16, 64(%%r1,%1) \n\t" - "vl %%v17, 80(%%r1,%1) \n\t" - "vl %%v18, 96(%%r1,%1) \n\t" - "vl %%v19, 112(%%r1,%1) \n\t" - "vl %%v0, 64(%%r1,%2) \n\t" - "vl %%v1, 80(%%r1,%2) \n\t" - "vl %%v2, 96(%%r1,%2) \n\t" - "vl %%v3, 112(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - "verllg %%v22,%%v18,32 \n\t" - "verllg %%v23,%%v19,32 \n\t" - - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v24,%%v24,%%v26 \n\t" - "vfasb %%v24,%%v24,%%v28 \n\t" - "vfasb %%v24,%%v24,%%v30 \n\t" - "vrepg %%v26,%%v24,1 \n\t" - "vfasb %%v24,%%v24,%%v26 \n\t" - "vfasb %%v25,%%v25,%%v27 \n\t" - "vfasb %%v25,%%v25,%%v29 \n\t" - "vfasb %%v25,%%v25,%%v31 \n\t" - "vrepg %%v27,%%v25,1 \n\t" - "vfasb %%v25,%%v25,%%v27 \n\t" - "vstef %%v24,0(%3),0 \n\t" - "vstef %%v24,4(%3),1 \n\t" - "vstef %%v25,8(%3),1 \n\t" - "vstef %%v25,12(%3),0 " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 
64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vrepg %%v26,%%v24,1\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vfasb %%v25,%%v25,%%v29\n\t" + "vfasb %%v25,%%v25,%%v31\n\t" + "vrepg %%v27,%%v25,1\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vstef %%v24,0(%[d]),0\n\t" + "vstef %%v24,4(%[d]),1\n\t" + "vstef %%v25,8(%[d]),1\n\t" + "vstef %%v25,12(%[d]),0" + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - if ((inc_x == 1) && (inc_y == 1)) { + } - BLASLONG n1 = n & -16; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) - cdot_kernel_16(n1, x, y, dot); + BLASLONG n1 = n & -16; - i = n1; - BLASLONG j = i * 2; + if (n1) + cdot_kernel_16(n1, x, y, dot); - while (i < n) { + i = n1; + BLASLONG j = i * 2; - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; + while (i < n) { - j += 2; - i++; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - } + j += 2; + i++; + } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git 
a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index ed81325e1..db91d9063 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,719 +25,720 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include <stdlib.h> -#include <stdio.h> #include "common.h" #define NBMAX 2048 -static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%5) \n\t" - "vlrepg %%v17,8(%5) \n\t" - "vlrepg %%v18,16(%5) \n\t" - "vlrepg %%v19,24(%5) \n\t" +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v18,16(%[x])\n\t" + "vlrepg %%v19,24(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" #else - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" + "vflcsb %%v23,%%v23\n\t" +
"vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlef %%v24,0(%%r1,%1),0 \n\t" - "vlef %%v24,0(%%r1,%1),1 \n\t" - "vlef %%v24,8(%%r1,%1),2 \n\t" - "vlef %%v24,8(%%r1,%1),3 \n\t" - "vlef %%v25,4(%%r1,%1),0 \n\t" - "vlef %%v25,4(%%r1,%1),1 \n\t" - "vlef %%v25,12(%%r1,%1),2 \n\t" - "vlef %%v25,12(%%r1,%1),3 \n\t" - "vlef %%v26,0(%%r1,%2),0 \n\t" - "vlef %%v26,0(%%r1,%2),1 \n\t" - "vlef %%v26,8(%%r1,%2),2 \n\t" - "vlef %%v26,8(%%r1,%2),3 \n\t" - "vlef %%v27,4(%%r1,%2),0 \n\t" - "vlef %%v27,4(%%r1,%2),1 \n\t" - "vlef %%v27,12(%%r1,%2),2 \n\t" - "vlef %%v27,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlef %%v28,0(%%r1,%3),0 \n\t" - "vlef %%v28,0(%%r1,%3),1 \n\t" - "vlef %%v28,8(%%r1,%3),2 \n\t" - "vlef %%v28,8(%%r1,%3),3 \n\t" - "vlef %%v29,4(%%r1,%3),0 \n\t" - "vlef %%v29,4(%%r1,%3),1 \n\t" - "vlef %%v29,12(%%r1,%3),2 \n\t" - "vlef %%v29,12(%%r1,%3),3 \n\t" - "vlef %%v30,0(%%r1,%4),0 \n\t" - "vlef %%v30,0(%%r1,%4),1 \n\t" - "vlef %%v30,8(%%r1,%4),2 \n\t" - "vlef %%v30,8(%%r1,%4),3 \n\t" - "vlef %%v31,4(%%r1,%4),0 \n\t" - "vlef %%v31,4(%%r1,%4),1 \n\t" - "vlef %%v31,12(%%r1,%4),2 \n\t" - "vlef %%v31,12(%%r1,%4),3 \n\t" - - "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,0(%%r1,%6) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vperm %%v25,%%v24,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v24,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap1])\n\t" + "vperm %%v27,%%v26,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v26,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" + "vl %%v28,0(%%r1,%[ap2])\n\t" + "vperm %%v29,%%v28,%%v28,%%v2\n\t" + "vperm 
%%v28,%%v28,%%v28,%%v1\n\t" + "vl %%v30,0(%%r1,%[ap3])\n\t" + "vperm %%v31,%%v30,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v30,%%v1\n\t" + "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%3) \n\t" - "vlrepg %%v17,8(%3) \n\t" +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" #else - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlef %%v20,0(%%r1,%1),0 \n\t" - "vlef %%v20,0(%%r1,%1),1 \n\t" - "vlef %%v20,8(%%r1,%1),2 \n\t" - "vlef %%v20,8(%%r1,%1),3 \n\t" - "vlef %%v21,4(%%r1,%1),0 \n\t" - "vlef %%v21,4(%%r1,%1),1 \n\t" - "vlef %%v21,12(%%r1,%1),2 \n\t" - "vlef %%v21,12(%%r1,%1),3 \n\t" - "vlef %%v22,0(%%r1,%2),0 \n\t" - "vlef %%v22,0(%%r1,%2),1 \n\t" - "vlef %%v22,8(%%r1,%2),2 \n\t" - "vlef %%v22,8(%%r1,%2),3 \n\t" - "vlef %%v23,4(%%r1,%2),0 \n\t" - "vlef %%v23,4(%%r1,%2),1 \n\t" - "vlef %%v23,12(%%r1,%2),2 \n\t" - "vlef %%v23,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%4) \n\t" - "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,0(%%r1,%4) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "vleib 
%%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v20,0(%%r1,%[ap0])\n\t" + "vperm %%v21,%%v20,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v20,%%v1\n\t" + "vl %%v22,0(%%r1,%[ap1])\n\t" + "vperm %%v23,%%v22,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v22,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%2) \n\t" +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" - "vflcsb %%v17,%%v17 \n\t" - "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" #else - "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" - "vflcsb %%v17,%%v17 \n\t" - "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vlef %%v18,0(%%r1,%1),0 \n\t" - "vlef %%v18,0(%%r1,%1),1 \n\t" - "vlef %%v18,8(%%r1,%1),2 \n\t" - "vlef %%v18,8(%%r1,%1),3 \n\t" - "vlef %%v19,4(%%r1,%1),0 \n\t" - "vlef %%v19,4(%%r1,%1),1 \n\t" - "vlef %%v19,12(%%r1,%1),2 \n\t" - "vlef %%v19,12(%%r1,%1),3 \n\t" - - "vl %%v0,0(%%r1,%3) \n\t" - "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19" - ); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib 
%%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v18,0(%%r1,%[ap])\n\t" + "vperm %%v19,%%v18,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v18,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) -{ - __asm__ volatile ( -#if !defined(XCONJ) - "vlrepf %%v0,%3 \n\t" - "vlef %%v1,%4,0 \n\t" - "vlef %%v1,%4,2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,%4,1 \n\t" - "vlef %%v1,%4,3 \n\t" +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepf %%v0,%[alpha_r]\n\t" + "vlef %%v1,%[alpha_i],0\n\t" + "vlef %%v1,%[alpha_i],2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,%[alpha_i],1\n\t" + "vlef %%v1,%[alpha_i],3\n\t" #else - "vlef %%v0,%3,1 \n\t" - "vlef %%v0,%3,3 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,%3,0 \n\t" - "vlef %%v0,%3,2 \n\t" - "vlrepf %%v1,%4 \n\t" + "vlef %%v0,%[alpha_r],1\n\t" + "vlef %%v0,%[alpha_r],3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,%[alpha_r],0\n\t" + "vlef %%v0,%[alpha_r],2\n\t" + "vlrepf %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,2 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,0(%%r1,%2) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - - "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" - "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" - - "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" - "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" - - "vst %%v22,0(%%r1,%2) \n\t" - "vst %%v23,16(%%r1,%2) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,0(%%r1,%[dest])\n\t" + "vl %%v19,16(%%r1,%[dest])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" + "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" + "vst %%v22,0(%%r1,%[dest])\n\t" + "vst %%v23,16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), + [alpha_r] "m"(alpha_r),[alpha_i] 
"m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if ( inc_dest != 2 ) - { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); - else - xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - - for( i = 0; i < n1 ; i++) - { - cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if ( n2 & 2 ) - { - cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if ( n2 & 1 ) - { - cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } - else - { - - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - - - if ( m3 == 0 ) return(0); - - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while ( j < n) - { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, ap, xbuffer, 
y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } - - - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = 
x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } - + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; 
+ y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return(0); - } + return (0); + } + if (m3 == 1) { - if ( m3 == 1 ) - { + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - return(0); + return (0); } diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c 
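One substantive change in the cgemv file beyond the operand renaming: the rewritten kernels build two vperm masks with vleib and use them to duplicate the real and imaginary words of each complex element, where the old code issued four vlef element loads per vector. A rough C model of what those two masks select, assuming the same v1/v2 byte patterns shown in the diff (the names src/re/im are illustrative only):

#include <stdio.h>

/* Byte-index model of the two vleib-built vperm masks: applied to a vector
   of two complex floats {re0,im0,re1,im1}, v1 yields {re0,re0,re1,re1} and
   v2 yields {im0,im0,im1,im1}. */
int main(void) {
  const unsigned char v1[16] =
      {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11};
  const unsigned char v2[16] =
      {4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15};
  unsigned char src[16], re[16], im[16];
  for (int i = 0; i < 16; i++)
    src[i] = (unsigned char)i;          /* stand-in source bytes */
  for (int i = 0; i < 16; i++) {
    re[i] = src[v1[i]];                 /* vperm dst,src,src,v1 */
    im[i] = src[v2[i]];                 /* vperm dst,src,src,v2 */
  }
  for (int i = 0; i < 16; i++)
    printf("%2d%c", re[i], i == 15 ? '\n' : ' ');
  for (int i = 0; i < 16; i++)
    printf("%2d%c", im[i], i == 15 ? '\n' : ' ');
  return 0;
}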
index f04a624ac..669d78a9d 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepf %%v0,%3 \n\t" - "vlrepf %%v1,%4 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - 
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb 
%%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb 
%%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - crot_kernel_32(n1, x, y, &cosa, &sina); - i=n1; - ix=2*n1; - } - - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + ix += 2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index 0c15c5add..a2d5bf223 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,430 +27,400 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
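Across all of these kernels the rewrite replaces positional asm operands (%0, %1, ...) with named ones (%[n], %[x], ...) and decrements the n operand directly instead of burning r0 as a separate counter. A tiny hedged sketch of the two operand styles, assuming an s390x target and GCC extended asm; add_positional and add_named are illustrative names, not from the patch:

#include <stdio.h>

/* agr: s390x 64-bit register add. Same instruction, two operand styles. */
static long add_positional(long a, long b) {
  __asm__("agr %0,%1" : "+r"(a) : "r"(b));
  return a;
}

static long add_named(long a, long b) {
  /* named operands stay readable as the operand list grows */
  __asm__("agr %[a],%[b]" : [a] "+r"(a) : [b] "r"(b));
  return a;
}

int main(void) {
  printf("%ld %ld\n", add_positional(2, 3), add_named(2, 3));
  return 0;
}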
#include "common.h" -static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepf %%v0,0(%1) \n\t" - "vlef %%v1,4(%1),0 \n\t" - "vlef %%v1,4(%1),2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,4(%1),1 \n\t" - "vlef %%v1,4(%1),3 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - "verllg %%v28,%%v20,32 \n\t" - "verllg %%v29,%%v21,32 \n\t" - "verllg %%v30,%%v22,32 \n\t" - "verllg %%v31,%%v23,32 \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlef %%v0,4(%1),0 \n\t" - "vlef %%v0,4(%1),2 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,4(%1),1 \n\t" - "vlef %%v0,4(%1),3 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v16,%%v16,32 \n\t" - "verllg %%v17,%%v17,32 \n\t" - "verllg %%v18,%%v18,32 \n\t" - "verllg %%v19,%%v19,32 \n\t" - "verllg %%v20,%%v20,32 \n\t" - "verllg %%v21,%%v21,32 \n\t" - "verllg %%v22,%%v22,32 \n\t" - "verllg %%v23,%%v23,32 \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepf %%v0,0(%[alpha])\n\t" + 
"vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v24,%%v16,32\n\t" + "verllg %%v25,%%v17,32\n\t" + "verllg %%v26,%%v18,32\n\t" + "verllg %%v27,%%v19,32\n\t" + "verllg %%v28,%%v20,32\n\t" + "verllg %%v29,%%v21,32\n\t" + "verllg %%v30,%%v22,32\n\t" + "verllg %%v31,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepf %%v0,0(%1) \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlef %%v0,4(%[alpha]),0\n\t" + "vlef %%v0,4(%[alpha]),2\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,4(%[alpha]),1\n\t" + "vlef %%v0,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl 
%%v23,112(%%r1,%[x])\n\t" + "verllg %%v16,%%v16,32\n\t" + "verllg %%v17,%%v17,32\n\t" + "verllg %%v18,%%v18,32\n\t" + "verllg %%v19,%%v19,32\n\t" + "verllg %%v20,%%v20,32\n\t" + "verllg %%v21,%%v21,32\n\t" + "verllg %%v22,%%v22,32\n\t" + "verllg %%v23,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepf %%v0,0(%[alpha])\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) - { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; 
- x[inc_x3] = t3; - - x += 4 * inc_x; - } +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - - if (inc_x != 1) { - inc_x <<= 1; - - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, + BLASLONG inc_x) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} - while (j < n1) { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; + if (inc_x != 1) { + inc_x <<= 1; - } + if (da_r == 0.0) { - while (j < n) { + BLASLONG n1 = n & -2; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + if (da_i == 0.0) { - } + while (j < n1) { - } else { + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + } - } + while (j < n) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + } - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + } - while (j < n1) { + } else { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - } + while 
(j < n1) { - while (j < n) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } - } + while (j < n) { - } else { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - cscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } - } + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; } - return (0); - } + } + } - BLASLONG n1 = n & -16; - if (n1 > 0) { + return (0); + } - alpha[0] = da_r; - alpha[1] = da_i; + BLASLONG n1 = n & -16; + if (n1 > 0) { - if (da_r == 0.0) - if (da_i == 0) - cscal_kernel_16_zero(n1, x); - else - cscal_kernel_16_zero_r(n1, alpha, x); - else - if (da_i == 0) - cscal_kernel_16_zero_i(n1, alpha, x); - else - cscal_kernel_16(n1, alpha, x); + alpha[0] = da_r; + alpha[1] = da_i; - i = n1 << 1; - j = n1; - } + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + i = n1 << 1; + j = n1; + } - if (da_r == 0.0) { + if (da_r == 0.0) { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } - } + } - } else { + } else { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } - - } + } } - return (0); + } + + return (0); } diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 256995d50..92a81591f 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
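Aside, not part of the patch: the cscal CNAME above dispatches on four alpha cases because each special case needs fewer operations than the full complex multiply. A minimal scalar sketch of the same dispatch, assuming FLOAT is float and interleaved real/imaginary storage; cscal_ref is an illustrative name:

#include <stddef.h>

static void cscal_ref(size_t n, float da_r, float da_i, float *x) {
  for (size_t i = 0; i < 2 * n; i += 2) {
    if (da_r == 0.0f && da_i == 0.0f) {  /* alpha == 0: plain zero fill */
      x[i] = 0.0f;
      x[i + 1] = 0.0f;
    } else if (da_r == 0.0f) {           /* pure imaginary alpha */
      float t = -da_i * x[i + 1];
      x[i + 1] = da_i * x[i];
      x[i] = t;
    } else if (da_i == 0.0f) {           /* pure real alpha */
      x[i] = da_r * x[i];
      x[i + 1] = da_r * x[i + 1];
    } else {                             /* full complex multiply */
      float t = da_r * x[i] - da_i * x[i + 1];
      x[i + 1] = da_r * x[i + 1] + da_i * x[i];
      x[i] = t;
    }
  }
}

The zero-fill branch is also why the vector path has a dedicated cscal_kernel_16_zero that only stores one vzero'd register.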
#include "common.h" -static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst 
%%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - cswap_kernel_32(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + cswap_kernel_32(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; + while (i < n) { - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; - } + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 827467189..37008f702 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -1,5 +1,5 @@ 
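Aside: the cswap hunk above, like every kernel in this series, replaces positional asm operands (%0, %1, ...) with GCC named operands, folds the loop counter into a "+&r" in/out operand so r0 no longer needs to be clobbered, and trades the blanket "memory" clobber for typed array operands so the optimizer knows exactly which bytes are read and written. A hypothetical, s390x-only example of the operand convention (add_loop is an illustrative name):

#include <stdio.h>

/* Accumulate 'step' into 'acc' n times (assumes n >= 1):
   %[name] references instead of %N, counter consumed via brctg. */
static long add_loop(long n, long step) {
  long acc = 0;
  __asm__("0:\n\t"
          "agr %[acc],%[step]\n\t" /* acc += step              */
          "brctg %[n],0b"          /* --n; branch while n != 0 */
          : [acc] "+&r"(acc),[n] "+&r"(n)
          : [step] "r"(step)
          : "cc");
  return acc;
}

int main(void) {
  printf("%ld\n", add_loop(5, 3)); /* prints 15 */
  return 0;
}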
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" - "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" - "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" - "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" - "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" - "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" - "vfmaxdb %%v23,%%v23,%%v31,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" - "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" - "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" - - "vfmaxdb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" - "lpdr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,8\n\t" + "vfmaxdb %%v17,%%v17,%%v25,8\n\t" + "vfmaxdb %%v18,%%v18,%%v26,8\n\t" + "vfmaxdb %%v19,%%v19,%%v27,8\n\t" + "vfmaxdb %%v20,%%v20,%%v28,8\n\t" + "vfmaxdb %%v21,%%v21,%%v29,8\n\t" + "vfmaxdb %%v22,%%v22,%%v30,8\n\t" + "vfmaxdb %%v23,%%v23,%%v31,8\n\t" + "vfmaxdb %%v16,%%v16,%%v20,8\n\t" + "vfmaxdb %%v17,%%v17,%%v21,8\n\t" + "vfmaxdb %%v18,%%v18,%%v22,8\n\t" + "vfmaxdb %%v19,%%v19,%%v23,8\n\t" + "vfmaxdb %%v16,%%v16,%%v18,8\n\t" + "vfmaxdb %%v17,%%v17,%%v19,8\n\t" + "vfmaxdb %%v16,%%v16,%%v17,8\n\t" + 
"vfmaxdb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = damax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = damax_kernel_32(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 95b94ee4a..530d6e5bb 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + 
"vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = damax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = damax_kernel_32(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - 
j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 821f9eccc..a01791741 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmindb %%v16,%%v16,%%v24,8 \n\t" - "vfmindb %%v17,%%v17,%%v25,8 \n\t" - "vfmindb %%v18,%%v18,%%v26,8 \n\t" - "vfmindb %%v19,%%v19,%%v27,8 \n\t" - "vfmindb %%v20,%%v20,%%v28,8 \n\t" - "vfmindb %%v21,%%v21,%%v29,8 \n\t" - "vfmindb %%v22,%%v22,%%v30,8 \n\t" - "vfmindb %%v23,%%v23,%%v31,8 \n\t" - - "vfmindb %%v16,%%v16,%%v20,8 \n\t" - "vfmindb %%v17,%%v17,%%v21,8 \n\t" - "vfmindb %%v18,%%v18,%%v22,8 \n\t" - "vfmindb %%v19,%%v19,%%v23,8 \n\t" - - "vfmindb %%v16,%%v16,%%v18,8 \n\t" - "vfmindb %%v17,%%v17,%%v19,8 \n\t" - - "vfmindb %%v16,%%v16,%%v17,8 \n\t" - - "vfmindb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,8 \n\t" - "lpdr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,8\n\t" + "vfmindb %%v17,%%v17,%%v25,8\n\t" + "vfmindb %%v18,%%v18,%%v26,8\n\t" + "vfmindb %%v19,%%v19,%%v27,8\n\t" + "vfmindb %%v20,%%v20,%%v28,8\n\t" + "vfmindb %%v21,%%v21,%%v29,8\n\t" + "vfmindb 
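Aside: every CNAME in these kernels splits the trip count as n1 = n & -K for a power-of-two unroll factor K (n & -32 for the vector loop, n & -4 for the strided loop) and finishes the remainder in a scalar tail. For power-of-two K, n & -K simply clears the low bits, i.e. rounds n down to a multiple of K:

#include <assert.h>

int main(void) {
  assert((71 & -32) == 64); /* largest multiple of 32 <= 71 */
  assert((71 & -4) == 68);  /* same idiom, unroll factor 4  */
  return 0;
}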
%%v22,%%v22,%%v30,8\n\t" + "vfmindb %%v23,%%v23,%%v31,8\n\t" + "vfmindb %%v16,%%v16,%%v20,8\n\t" + "vfmindb %%v17,%%v17,%%v21,8\n\t" + "vfmindb %%v18,%%v18,%%v22,8\n\t" + "vfmindb %%v19,%%v19,%%v23,8\n\t" + "vfmindb %%v16,%%v16,%%v18,8\n\t" + "vfmindb %%v17,%%v17,%%v19,8\n\t" + "vfmindb %%v16,%%v16,%%v17,8\n\t" + "vfmindb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = damin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = damin_kernel_32(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 538690ee5..2172b6d6f 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
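Aside: the _z13 variants exist because z13 predates the vector min/max instructions (vfmindb/vfmaxdb) used above; they synthesize min and max from a compare-high mask (vfchdb) plus a select (vsel), as in the damax_z13 hunk earlier and the damin_z13 hunk that follows. Assuming vfchdb sets the mask where its first operand compares greater than its second, the damin_z13 pair "vfchdb v24,v17,v16 / vsel v24,v16,v17,v24" is, in scalar terms, just a conditional move (min_via_select is an illustrative name):

static double min_via_select(double a, double b) {
  return (b > a) ? a : b; /* pick a where b > a, else b */
}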
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + 
"vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = damin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = damin_kernel_32(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - 
j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index fea431c34..9f69a9931 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,145 +28,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v2 \n\t" - "vfadb %%v0,%%v0,%%v3 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +#define ABS fabs + +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, 
%%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { - - n1 = n & -32; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = dasum_kernel_32(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -32; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = dasum_kernel_32(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index e8823745e..179ef8834 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( - "vlrepg %%v0,%3 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,80(%%r1,%1) \n\t" - "vl %%v26,96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v28,64(%%r1,%2) \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vl %%v30,96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "vl %%v16,128(%%r1,%1) \n\t" - "vl %%v17,144(%%r1,%1) \n\t" - "vl %%v18,160(%%r1,%1) \n\t" - "vl %%v19,176(%%r1,%1) \n\t" - "vl %%v20,128(%%r1,%2) \n\t" - "vl %%v21,144(%%r1,%2) \n\t" - "vl %%v22,160(%%r1,%2) \n\t" - "vl %%v23,176(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,192(%%r1,%1) \n\t" - "vl %%v25,208(%%r1,%1) \n\t" - "vl %%v26,224(%%r1,%1) \n\t" - "vl %%v27,240(%%r1,%1) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,128(%%r1,%2) \n\t" - "vst %%v17,144(%%r1,%2) \n\t" - "vst %%v18,160(%%r1,%2) \n\t" - "vst %%v19,176(%%r1,%2) \n\t" - "vst %%v20,192(%%r1,%2) \n\t" - "vst %%v21,208(%%r1,%2) \n\t" - "vst %%v22,224(%%r1,%2) \n\t" - "vst %%v23,240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepg %%v0,%[alpha]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl 
%%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + [alpha] "m"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return 0 ; + if (n <= 0) + return 0; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -32; - if ( n1 ) - daxpy_kernel_32(n1, x, y , &da); + if (n1) + daxpy_kernel_32(n1, x, y, &da); - i = n1; - while(i < n) - { - - y[i] += da * x[i] ; - i++ ; - - } - return 0 ; + i = n1; + while (i < n) { + y[i] += da * x[i]; + i++; } + return 0; - BLASLONG n1 = n & -4; + } - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = da * x[ix] ; - FLOAT m2 = da * x[ix+inc_x] ; - FLOAT m3 = da * x[ix+2*inc_x] ; - FLOAT m4 = da * x[ix+3*inc_x] ; + while (i < n1) { - y[iy] += m1 ; - y[iy+inc_y] += m2 ; - y[iy+2*inc_y] += m3 ; - y[iy+3*inc_y] += m4 ; + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; - i+=4 ; + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; - } + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; - while(i < n) - { + } - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - return 
0 ; - -} + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + return 0; +} diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index bb5325693..f7cbf54b2 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,5 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","r2" - ); +static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n]) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if (n <= 0) return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - dcopy_kernel_32(n1, x, y); - i = n1; - } + if (n <= 0) + return 0; - while (i < n) { - y[i] = x[i]; - i++; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dcopy_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + y[i] = x[i]; + i++; - } else { + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index ff4c347a6..f5f601717 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,123 +27,127 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
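Aside: the ddot rewrite below replaces the old single accumulator (v0) with eight (v0..v7) that are only summed after the loop. With one accumulator every vfmadb must wait for the previous one to retire; independent partial sums let the FMAs pipeline. A scalar sketch of the idea with four partials; dot_unrolled is an illustrative name:

static double dot_unrolled(long n, const double *x, const double *y) {
  double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
  long i;
  for (i = 0; i + 4 <= n; i += 4) { /* four independent FMA chains */
    s0 += x[i] * y[i];
    s1 += x[i + 1] * y[i + 1];
    s2 += x[i + 2] * y[i + 2];
    s3 += x[i + 3] * y[i + 3];
  }
  double s = (s0 + s1) + (s2 + s3); /* combine once, after the loop */
  for (; i < n; i++)                /* scalar tail */
    s += x[i] * y[i];
  return s;
}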
#include "common.h" -static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), + [y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) 
) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if ( n1 ) - dot = ddot_kernel_16(n1, x, y); + if (n1) + dot = ddot_kernel_16(n1, x, y); - i = n1; - while(i < n) - { - - dot += y[i] * x[i] ; - i++ ; - - } - return(dot); + i = n1; + while (i < n) { + dot += y[i] * x[i]; + i++; } + return (dot); - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; + } - BLASLONG n1 = n & -4; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = y[iy] * x[ix] ; - FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + while (i < n1) { - FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; - FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + FLOAT m1 = y[iy] * x[ix]; + FLOAT m2 = y[iy + inc_y] * x[ix + inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; + FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x]; + FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x]; - temp1 += m1+m3; - temp2 += m2+m4; + ix += inc_x * 4; + iy += inc_y * 4; - i+=4 ; + temp1 += m1 + m3; + temp2 += m2 + m4; - } + i += 4; - while(i < n) - { + } - temp1 += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - dot = temp1 + temp2; - return(dot); - -} + temp1 += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + dot = temp1 + temp2; + return (dot); +} diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index ca4fd6170..c93ff9b54 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,663 +29,579 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
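/* [Editor's note: annotation added for exposition; not part of the upstream
   patch.] The ddot rewrite above swaps the single accumulator %%v0 for eight
   independent accumulators %%v0..%%v7, so the vfmadb (vector fused
   multiply-add) instructions no longer form one serial dependency chain; the
   partial sums are folded together with vfadb only after the loop. It also
   moves the asm to named operands ([x], [y], [n]) with typed "m" memory
   operands in place of the blanket "memory" clobber. A minimal portable C
   sketch of the multi-accumulator idea, assuming n is a positive multiple
   of 8; the function name is illustrative, not from the source: */

static double ddot_split8_sketch(long n, const double *x, const double *y) {
  // Eight independent partial sums mirror %%v0..%%v7 in the asm kernel.
  double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
  double s4 = 0.0, s5 = 0.0, s6 = 0.0, s7 = 0.0;
  long i;
  for (i = 0; i < n; i += 8) {
    s0 += x[i]     * y[i];
    s1 += x[i + 1] * y[i + 1];
    s2 += x[i + 2] * y[i + 2];
    s3 += x[i + 3] * y[i + 3];
    s4 += x[i + 4] * y[i + 4];
    s5 += x[i + 5] * y[i + 5];
    s6 += x[i + 6] * y[i + 6];
    s7 += x[i + 7] * y[i + 7];
  }
  // Pairwise reduction of the partials, as the trailing vfadb/vrepg/adbr
  // sequence does after the kernel's main loop.
  return ((s0 + s1) + (s2 + s3)) + ((s4 + s5) + (s6 + s7));
}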
#define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%5) \n\t" - "vlrepg %%v1,8(%5) \n\t" - "vlrepg %%v2,16(%5) \n\t" - "vlrepg %%v3,24(%5) \n\t" - "vlrepg %%v4,%7 \n\t" - "vfmdb %%v0,%%v0,%%v4 \n\t" - "vfmdb %%v1,%%v1,%%v4 \n\t" - "vfmdb %%v2,%%v2,%%v4 \n\t" - "vfmdb %%v3,%%v3,%%v4 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - "vl %%v24,32(%%r1,%1) \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vl %%v28,48(%%r1,%1) \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "vl %%v4,32(%%r1,%6) \n\t" - "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,32(%%r1,%6) \n\t" - - "vl %%v4,48(%%r1,%6) \n\t" - "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,48(%%r1,%6) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,64(%%r1,%2) \n\t" - "vl %%v18,64(%%r1,%3) \n\t" - "vl %%v19,64(%%r1,%4) \n\t" - "vl %%v20,80(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,80(%%r1,%3) \n\t" - "vl %%v23,80(%%r1,%4) \n\t" - "vl %%v24,96(%%r1,%1) \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vl %%v28,112(%%r1,%1) \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - - "vl %%v4,64(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,64(%%r1,%6) \n\t" - - "vl %%v4,80(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,80(%%r1,%6) \n\t" - - "vl %%v4,96(%%r1,%6) \n\t" - "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,96(%%r1,%6) \n\t" - - "vl %%v4,112(%%r1,%6) \n\t" - "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,112(%%r1,%6) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) 
\n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,16(%[x])\n\t" + "vlrepg %%v3,24(%[x])\n\t" + "vlrepg %%v4,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v4\n\t" + "vfmdb %%v1,%%v1,%%v4\n\t" + "vfmdb %%v2,%%v2,%%v4\n\t" + "vfmdb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl 
%%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%3) \n\t" - "vlrepg %%v1,8(%3) \n\t" - "vlrepg %%v2,%5 \n\t" - "vfmdb %%v0,%%v0,%%v2 \n\t" - "vfmdb %%v1,%%v1,%%v2 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "vl %%v20,32(%%r1,%1) \n\t" - "vl %%v21,32(%%r1,%2) \n\t" - "vl %%v22,48(%%r1,%1) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vl %%v26,80(%%r1,%1) \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vl %%v28,96(%%r1,%1) \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%1) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "vl %%v2,32(%%r1,%4) \n\t" - "vfmadb 
%%v2,%%v20,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" - "vst %%v2,32(%%r1,%4) \n\t" - - "vl %%v2,48(%%r1,%4) \n\t" - "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" - "vst %%v2,48(%%r1,%4) \n\t" - - "vl %%v2,64(%%r1,%4) \n\t" - "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" - "vst %%v2,64(%%r1,%4) \n\t" - - "vl %%v2,80(%%r1,%4) \n\t" - "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" - "vst %%v2,80(%%r1,%4) \n\t" - - "vl %%v2,96(%%r1,%4) \n\t" - "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" - "vst %%v2,96(%%r1,%4) \n\t" - - "vl %%v2,112(%%r1,%4) \n\t" - "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" - "vst %%v2,112(%%r1,%4) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v2\n\t" + "vfmdb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst 
%%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%2) \n\t" - "vlrepg %%v1,%4 \n\t" - "vfmdb %%v0,%%v0,%%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%1) \n\t" - "vl %%v22,96(%%r1,%1) \n\t" - "vl %%v23,112(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "vl %%v1,32(%%r1,%3) \n\t" - "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" - "vst %%v1,32(%%r1,%3) \n\t" - - "vl %%v1,48(%%r1,%3) \n\t" - "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" - "vst %%v1,48(%%r1,%3) \n\t" - - "vl %%v1,64(%%r1,%3) \n\t" - "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" - "vst %%v1,64(%%r1,%3) \n\t" - - "vl %%v1,80(%%r1,%3) \n\t" - "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" - "vst %%v1,80(%%r1,%3) \n\t" - - "vl %%v1,96(%%r1,%3) \n\t" - "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" - "vst %%v1,96(%%r1,%3) \n\t" - - "vl %%v1,112(%%r1,%3) \n\t" - "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" - "vst %%v1,112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg 
%%v0,0(%[x])\n\t" + "vlrepg %%v16,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" + "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" + "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i]; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8],*ybuffer; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - ybuffer = buffer; - - n1 = n >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr 
+= 4; - } - - if ( n2 & 2 ) - { - dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 8); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] 
+= lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + } + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) 
{ + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } - return(0); + return (0); } - - diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 2d8fa0d10..24680cf1b 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,795 +29,724 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmadb %%v2,%%v22,%%v26,%%v2 
\n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "adbr %%f0,%%f4 \n\t" - "std %%f0,0(%6) \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "adbr %%f1,%%f4 \n\t" - "std %%f1,8(%6) \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "adbr %%f2,%%f4 \n\t" - "std %%f2,16(%6) \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "adbr %%f3,%%f4 \n\t" - "std %%f3,24(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" + "vl 
%%v29,48(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v2,%%v2,%%v6\n\t" + "vfadb %%v3,%%v3,%%v7\n\t" + "vrepg %%v4,%%v0,1\n\t" + "adbr %%f0,%%f4\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v4,%%v1,1\n\t" + "adbr %%f1,%%f4\n\t" + "std %%f1,8(%[y])\n\t" + "vrepg %%v4,%%v2,1\n\t" + "adbr %%f2,%%f4\n\t" + "std %%f2,16(%[y])\n\t" + "vrepg %%v4,%%v3,1\n\t" + "adbr %%f3,%%f4\n\t" + "std %%f3,24(%[y])" + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl 
%%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "adbr %%f0,%%f2 \n\t" - "std %%f0,0(%4) \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "adbr %%f1,%%f2 \n\t" - "std %%f1,8(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + 
"vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v1,%%v1,%%v3\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v1,%%v1,%%v7\n\t" + "vrepg %%v2,%%v0,1\n\t" + "adbr %%f0,%%f2\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v2,%%v1,1\n\t" + "adbr %%f1,%%f2\n\t" + "std %%f1,8(%[y])" + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "std %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - 
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "std %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } } - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepg %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmadb 
%%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) \n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepg %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void 
add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - yp = ytemp; + y_ptr = y; + a_ptr = a; + x_ptr = x; - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,xbuffer,yp); - ap[0] += lda4 ; - ap[1] += lda4 ; - ap[2] += lda4 ; - ap[3] += lda4 ; - yp += 4; - } - if ( n1 > 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; - } + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - if ( n2 & 2 ) - { + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + yp = ytemp; + for (i = 0; i < nb1; i++) { + dgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; - if ( n2 & 1 ) - { - - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + } - } - a += NB; - x += NB * inc_x; } - if ( m3 == 0 ) return(0); + yp = ytemp; - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - 
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - if ( inc_y == 1 ) - { + if (n2 & 2) { - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; + dgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - for ( j=0; j< ( n & -4 ); j+=4 ) - { + } - y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; - y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; - y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; - y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; - aj += lda4; - } + if (n2 & 1) { - for ( ; j< n ; j++ ) - { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; - y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; - aj += lda; - } + } + a += NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; - } - else - { + FLOAT *aj = a_ptr; + y_ptr = y; - for ( j=0; j 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = dmax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = dmax_kernel_32(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 83e7b02a8..87bccbe55 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
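
Note: the CNAME wrappers in these zarch kernels all share one shape: mask n down to the kernel's unroll factor, run the vector kernel on that part, and finish the remainder in scalar code. A minimal sketch of the idiom, with vector_max_32 as a hypothetical stand-in for the real dmax_kernel_32:

extern double vector_max_32(long n, const double *x); /* hypothetical */

static double max_with_tail(long n, const double *x) {
  /* n & -32 clears the low five bits: the largest multiple of 32 <= n */
  long n1 = n & -32;
  long i = 0;
  double maxf;
  if (n1 > 0) {
    maxf = vector_max_32(n1, x);
    i = n1;
  } else {
    maxf = x[0];
    i = 1;
  }
  while (i < n) {              /* scalar tail, at most 31 elements */
    if (x[i] > maxf)
      maxf = x[i];
    i++;
  }
  return maxf;
}
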
#include "common.h" -static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT max; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return max; +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT max; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb 
%%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return max; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) return (maxf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = dmax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = dmax_kernel_32(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 073289186..518cc262c 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
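
The recurring change in every kernel here is the move from positional asm operands (%0, %1, ...) guarded by __asm__ volatile and a blanket "memory" clobber to symbolic operand names plus array-typed memory operands. Declaring exactly which bytes the asm reads or writes lets the compiler keep everything else in registers across the statement, and volatile can be dropped because the asm's outputs are actually consumed. A minimal z/Arch-flavored sketch of the new operand style (illustration only, not part of the patch):

static double load_first(long n, const double *x) {
  double r;
  /* "m"(*(const double (*)[n]) x) declares the whole n-element array as
     an input the asm may read; [x] "a"(x) passes its address in a GPR. */
  __asm__("ld %[r],0(%[x])"    /* load x[0] into an FPR */
          : [r] "=f" (r)
          : "m" (*(const double (*)[n]) x), [x] "a" (x));
  return r;
}
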
#include "common.h" -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmindb %%v16,%%v16,%%v24,0 \n\t" - "vfmindb %%v17,%%v17,%%v25,0 \n\t" - "vfmindb %%v18,%%v18,%%v26,0 \n\t" - "vfmindb %%v19,%%v19,%%v27,0 \n\t" - "vfmindb %%v20,%%v20,%%v28,0 \n\t" - "vfmindb %%v21,%%v21,%%v29,0 \n\t" - "vfmindb %%v22,%%v22,%%v30,0 \n\t" - "vfmindb %%v23,%%v23,%%v31,0 \n\t" - - "vfmindb %%v16,%%v16,%%v20,0 \n\t" - "vfmindb %%v17,%%v17,%%v21,0 \n\t" - "vfmindb %%v18,%%v18,%%v22,0 \n\t" - "vfmindb %%v19,%%v19,%%v23,0 \n\t" - - "vfmindb %%v16,%%v16,%%v18,0 \n\t" - "vfmindb %%v17,%%v17,%%v19,0 \n\t" - - "vfmindb %%v16,%%v16,%%v17,0 \n\t" - - "vfmindb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v17,%%v17,%%v25,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v19,%%v19,%%v27,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v21,%%v21,%%v29,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v23,%%v23,%%v31,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v17,%%v17,%%v21,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v19,%%v19,%%v23,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v17,%%v17,%%v19,0\n\t" + "vfmindb %%v16,%%v16,%%v17,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; 
- BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = dmin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = dmin_kernel_32(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index e64f90ee3..91561992f 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
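
For reference, the reduction order in the rewritten dmin_kernel_32 above: the sixteen loaded vectors are folded pairwise with vfmindb (v16..v23 against v24..v31, then 8 to 4 to 2 to 1), and a final vrepg/wfmindb collapses the two lanes. A scalar sketch of the same tree over eight partials:

static double tree_min_8(const double v[8]) {
  /* pairwise folding, mirroring the vfmindb sequence in the kernel */
  double a = v[0] < v[4] ? v[0] : v[4];
  double b = v[1] < v[5] ? v[1] : v[5];
  double c = v[2] < v[6] ? v[2] : v[6];
  double d = v[3] < v[7] ? v[3] : v[7];
  a = a < c ? a : c;
  b = b < d ? b : d;
  return a < b ? a : b;
}
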
#include "common.h" -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb 
%%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = dmin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = dmin_kernel_32(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index c91f95800..8f0197f02 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
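
z13 has no vfmindb/vfmaxdb, so dmin_z13 here (and dmax_z13 above) synthesize min and max from a compare mask plus a bitwise select. The mapping, written out in C:

/* "vfchdb m,b,a" sets all-ones lanes where b > a; "vsel r,a,b,m" then
   takes bits from a where the mask is set, from b elsewhere. */
static inline double sel_min(double a, double b) {
  return (b > a) ? a : b;     /* vfchdb m,b,a ; vsel r,a,b,m */
}
static inline double sel_max(double a, double b) {
  return (a > b) ? a : b;     /* vfchdb m,a,b ; vsel r,a,b,m */
}
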
#include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepg %%v0,%3 \n\t" - "vlrepg %%v1,%4 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" 
/* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl 
%%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] 
"a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - drot_kernel_32(n1, x, y, &cosa, &sina); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - - i++ ; + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + drot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + } - } + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index ccc6dd95d..c944990b5 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,179 +27,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) -{ - __asm__ volatile ( - "vlrepg %%v0,%1 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%2) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vl %%v24, 64(%%r1,%2) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%2) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%2) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%2) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v24","v25","v26","v27" - ); +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepg %%v0,%[da]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmdb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmdb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmdb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmdb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmdb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmdb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmdb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmdb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x),[da] "m"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG 
inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { - - if ( da == 0.0 ) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - - dscal_kernel_16_zero(n1, x); - j=n1; - } - - while(j < n) - { - - x[j]=0.0; - j++; - } - - } - else - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - dscal_kernel_16(n1, da, x); - j=n1; - } - while(j < n) - { - - x[j] = da * x[j] ; - j++; - } - } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + if (inc_x == 1) { + + if (da == 0.0) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + dscal_kernel_16_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + dscal_kernel_16(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -4; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -4; - x[i]=0.0; - x[i + inc_x]=0.0; - x[i + 2 * inc_x]=0.0; - x[i + 3 * inc_x]=0.0; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = 0.0; + x[i + inc_x] = 0.0; + x[i + 2 * inc_x] = 0.0; + x[i + 3 * inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 4; + j += 4; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -4; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -4; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; - x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; - x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; + x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; + x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; - } + i += inc_x * 4; + j += 4; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; - -} + } + return 0; +} diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 72950c9f4..1ac02d4b9 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018,The OpenBLAS Project +Copyright (c) 2013-2019,The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms,with or without modification,are permitted provided that the following conditions are @@ -27,144 +27,146 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
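
dscal keeps two kernels because the da == 0.0 case degenerates to stores: the rewritten dscal_kernel_16_zero above zeroes a single vector register and stores it eight times per 128-byte block, never reading x (which also means a NaN in x is overwritten with 0 rather than propagated). What the two paths compute, in scalar form:

static void dscal_ref(long n, double da, double *x) {
  long i;
  if (da == 0.0) {
    for (i = 0; i < n; i++)   /* pure store loop: x is never read */
      x[i] = 0.0;
  } else {
    for (i = 0; i < n; i++)
      x[i] *= da;
  }
}
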
#include "common.h" -static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - double dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v16,4(%%r1,%2),2 \n\t" - "vlef %%v17,8(%%r1,%2),0 \n\t" - "vlef %%v17,12(%%r1,%2),2 \n\t" - "vlef %%v18,16(%%r1,%2),0 \n\t" - "vlef %%v18,20(%%r1,%2),2 \n\t" - "vlef %%v19,24(%%r1,%2),0 \n\t" - "vlef %%v19,28(%%r1,%2),2 \n\t" - "vlef %%v20,32(%%r1,%2),0 \n\t" - "vlef %%v20,36(%%r1,%2),2 \n\t" - "vlef %%v21,40(%%r1,%2),0 \n\t" - "vlef %%v21,44(%%r1,%2),2 \n\t" - "vlef %%v22,48(%%r1,%2),0 \n\t" - "vlef %%v22,52(%%r1,%2),2 \n\t" - "vlef %%v23,56(%%r1,%2),0 \n\t" - "vlef %%v23,60(%%r1,%2),2 \n\t" - - "vflls %%v16,%%v16 \n\t" - "vflls %%v17,%%v17 \n\t" - "vflls %%v18,%%v18 \n\t" - "vflls %%v19,%%v19 \n\t" - "vflls %%v20,%%v20 \n\t" - "vflls %%v21,%%v21 \n\t" - "vflls %%v22,%%v22 \n\t" - "vflls %%v23,%%v23 \n\t" - - "vlef %%v24,0(%%r1,%3),0 \n\t" - "vlef %%v24,4(%%r1,%3),2 \n\t" - "vflls %%v24,%%v24 \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vlef %%v25,8(%%r1,%3),0 \n\t" - "vlef %%v25,12(%%r1,%3),2 \n\t" - "vflls %%v25,%%v25 \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - "vlef %%v26,16(%%r1,%3),0 \n\t" - "vlef %%v26,20(%%r1,%3),2 \n\t" - "vflls %%v26,%%v26 \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - "vlef %%v27,24(%%r1,%3),0 \n\t" - "vlef %%v27,28(%%r1,%3),2 \n\t" - "vflls %%v27,%%v27 \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - "vlef %%v28,32(%%r1,%3),0 \n\t" - "vlef %%v28,36(%%r1,%3),2 \n\t" - "vflls %%v28,%%v28 \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - "vlef %%v29,40(%%r1,%3),0 \n\t" - "vlef %%v29,44(%%r1,%3),2 \n\t" - "vflls %%v29,%%v29 \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - "vlef %%v30,48(%%r1,%3),0 \n\t" - "vlef %%v30,52(%%r1,%3),2 \n\t" - "vflls %%v30,%%v30 \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - "vlef %%v31,56(%%r1,%3),0 \n\t" - "vlef %%v31,60(%%r1,%3),2 \n\t" - "vflls %%v31,%%v31 \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,64 \n\t" - "brctg %%r0,0b \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + double dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vlef %%v16,0(%%r1,%[x]),0\n\t" + "vlef %%v16,4(%%r1,%[x]),2\n\t" + "vlef %%v17,8(%%r1,%[x]),0\n\t" + "vlef %%v17,12(%%r1,%[x]),2\n\t" + "vlef %%v18,16(%%r1,%[x]),0\n\t" + "vlef %%v18,20(%%r1,%[x]),2\n\t" + "vlef %%v19,24(%%r1,%[x]),0\n\t" + "vlef %%v19,28(%%r1,%[x]),2\n\t" + "vlef %%v20,32(%%r1,%[x]),0\n\t" + "vlef %%v20,36(%%r1,%[x]),2\n\t" + "vlef %%v21,40(%%r1,%[x]),0\n\t" + "vlef %%v21,44(%%r1,%[x]),2\n\t" + "vlef %%v22,48(%%r1,%[x]),0\n\t" + "vlef %%v22,52(%%r1,%[x]),2\n\t" + "vlef %%v23,56(%%r1,%[x]),0\n\t" + "vlef %%v23,60(%%r1,%[x]),2\n\t" + "vflls %%v16,%%v16\n\t" + "vflls %%v17,%%v17\n\t" + "vflls %%v18,%%v18\n\t" + "vflls %%v19,%%v19\n\t" + "vflls %%v20,%%v20\n\t" + "vflls %%v21,%%v21\n\t" + "vflls %%v22,%%v22\n\t" + "vflls 
%%v23,%%v23\n\t" + "vlef %%v24,0(%%r1,%[y]),0\n\t" + "vlef %%v24,4(%%r1,%[y]),2\n\t" + "vflls %%v24,%%v24\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vlef %%v25,8(%%r1,%[y]),0\n\t" + "vlef %%v25,12(%%r1,%[y]),2\n\t" + "vflls %%v25,%%v25\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vlef %%v26,16(%%r1,%[y]),0\n\t" + "vlef %%v26,20(%%r1,%[y]),2\n\t" + "vflls %%v26,%%v26\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vlef %%v27,24(%%r1,%[y]),0\n\t" + "vlef %%v27,28(%%r1,%[y]),2\n\t" + "vflls %%v27,%%v27\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vlef %%v28,32(%%r1,%[y]),0\n\t" + "vlef %%v28,36(%%r1,%[y]),2\n\t" + "vflls %%v28,%%v28\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vlef %%v29,40(%%r1,%[y]),0\n\t" + "vlef %%v29,44(%%r1,%[y]),2\n\t" + "vflls %%v29,%%v29\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vlef %%v30,48(%%r1,%[y]),0\n\t" + "vlef %%v30,52(%%r1,%[y]),2\n\t" + "vflls %%v30,%%v30\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vlef %%v31,56(%%r1,%[y]),0\n\t" + "vlef %%v31,60(%%r1,%[y]),2\n\t" + "vflls %%v31,%%v31\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - double dot = 0.0 ; + double dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if ( n1 ) - dot = dsdot_kernel_16(n1,x,y); + if (n1) + dot = dsdot_kernel_16(n1, x, y); - i = n1; - while(i < n) - { + i = n1; + while (i < n) { - dot += (double) y[i] * (double) x[i] ; - i++ ; + dot += (double) y[i] * (double) x[i]; + i++; - } - return(dot); + } + return (dot); + } - } + BLASLONG n1 = n & -2; - BLASLONG n1 = n & -2; + while (i < n1) { - while(i < n1) - { + dot += (double) y[iy] * (double) x[ix]; + dot += (double) y[iy + inc_y] * (double) x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; - dot += (double) y[iy] * (double) x[ix]; - dot += (double) y[iy+inc_y] * (double) x[ix+inc_x]; - ix += inc_x*2 ; - iy += inc_y*2 ; - i+=2 ; + } - } + while (i < n) { - while(i < n) - { + dot += (double) y[iy] * (double) x[ix]; + ix += inc_x; + iy += inc_y; + i++; - dot += (double) y[iy] * (double) x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); + } + return (dot); } - - diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 8070ef41a..60ba40bd6 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
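
The dsdot rewrite above is more than reformatting: the single accumulator v0 becomes eight (v0..v7), one per load/extend/FMA stream, summed only after the loop. With one register, every vfmadb waits on the previous one's latency; eight independent partial sums keep the pipeline full, at the cost of a different (but still deterministic) summation order. A scalar sketch of the split:

static double dsdot_ref(long n, const float *x, const float *y) {
  double acc[8] = { 0.0 };    /* independent partial sums */
  double dot = 0.0;
  long i = 0;
  int k;
  for (; i + 8 <= n; i += 8)
    for (k = 0; k < 8; k++)   /* each k is a separate dependency chain */
      acc[k] += (double) x[i + k] * (double) y[i + k];
  for (k = 0; k < 8; k++)
    dot += acc[k];
  for (; i < n; i++)          /* scalar tail */
    dot += (double) x[i] * (double) y[i];
  return dot;
}
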
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,136 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + 
"vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1 )) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - dswap_kernel_32(n1, x, y); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; - - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dswap_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - + + } + return (0); } diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 5129ca6ee..1e1040a6e 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) -static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - __asm__ volatile ( - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v0,8(%3),1 \n\t" - "vlef %%v1,12(%3),1 \n\t" - "vlef %%v0,16(%3),2 \n\t" - "vlef %%v1,20(%3),2 \n\t" - "vlef %%v0,24(%3),3 \n\t" - "vlef %%v1,28(%3),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v1,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,16 \n\t" - "vzero %%v4 \n\t" - "vleib %%v9,0,0 \n\t" - "vleib %%v9,1,1 \n\t" - "vleib %%v9,2,2 \n\t" - "vleib %%v9,3,3 \n\t" - "vleib %%v9,8,4 \n\t" - "vleib %%v9,9,5 \n\t" - "vleib %%v9,10,6 \n\t" - "vleib %%v9,11,7 \n\t" - "vleib %%v9,16,8 \n\t" - "vleib %%v9,17,9 \n\t" - "vleib %%v9,18,10 \n\t" - "vleib %%v9,19,11 \n\t" - "vleib %%v9,24,12 \n\t" - "vleib %%v9,25,13 \n\t" - "vleib %%v9,26,14 \n\t" - "vleib %%v9,27,15 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" +static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v28,16(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v29,48(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v28,144(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v29,176(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v30,208(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v31,240(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + return iamax; +} - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" + if (n <= 0 || inc_x <= 0) + return (max); - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" + if (inc_x == 1) { - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 
\n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + BLASLONG n1 = n & -32; + if (n1 > 0) { - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + max = icamax_kernel_32(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - return iamax; -} + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (max + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + max = 0; + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - max = icamax_kernel_32(n1, x, &maxf); - ix = n1 * 2; - i = n1; + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); } - else - { - maxf = CABS1(x,0); - ix += 2; - i++; - } - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); + ix += inc_x2 * 4; - } else { - - max = 0; - maxf = CABS1(x,0); - inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (max + 1); + } } - - diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 05068b212..d1c0e32a1 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) -static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - __asm__ volatile ( - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v0,8(%3),1 \n\t" - "vlef %%v1,12(%3),1 \n\t" - "vlef %%v0,16(%3),2 \n\t" - "vlef %%v1,20(%3),2 \n\t" - "vlef %%v0,24(%3),3 \n\t" - "vlef %%v1,28(%3),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v1,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,16 \n\t" - "vzero %%v4 \n\t" - "vleib %%v9,0,0 \n\t" - "vleib %%v9,1,1 \n\t" - "vleib %%v9,2,2 \n\t" - "vleib %%v9,3,3 \n\t" - "vleib %%v9,8,4 \n\t" - "vleib %%v9,9,5 \n\t" - "vleib %%v9,10,6 \n\t" - "vleib %%v9,11,7 \n\t" - "vleib %%v9,16,8 \n\t" - "vleib %%v9,17,9 \n\t" - "vleib %%v9,18,10 \n\t" - "vleib %%v9,19,11 \n\t" - "vleib %%v9,24,12 \n\t" - "vleib %%v9,25,13 \n\t" - "vleib %%v9,26,14 \n\t" - "vleib %%v9,27,15 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" +static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v28,16(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v29,48(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v28,144(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v29,176(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v30,208(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v31,240(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + return iamin; +} - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" + if (n <= 0 || inc_x <= 0) + return (min); - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" + if (inc_x == 1) { - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 
\n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + BLASLONG n1 = n & -32; + if (n1 > 0) { - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + min = icamin_kernel_32(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } - return iamin; -} + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (min + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(min); - - if (inc_x == 1) { + min = 0; + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - min = icamin_kernel_32(n1, x, &minf); - ix = n1 * 2; - i = n1; + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); } - else - { - minf = CABS1(x,0); - ix += 2; - i++; - } - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); + ix += inc_x2 * 4; - } else { - - min = 0; - minf = CABS1(x,0); - inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (min + 1); + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (min + 1); + } } - - diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index e5a1d3a7c..8434c811f 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel 
%%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamax; +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + 
"vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = idamax_kernel_32(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + max = idamax_kernel_32(n1, x, &maxf); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } - max = 0; - maxf = ABS(x[0]); + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - BLASLONG n1 = n & -4; - while (j < n1) { + } else { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + max = 0; + maxf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index a68f7282f..80a37e6c2 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel 
%%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamin; +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb 
%%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - - if (n <= 0 || inc_x <= 0) return (min); - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = idamin_kernel_32(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } else { + minf = ABS(x[0]); + i++; + } - min = 0; - minf = ABS(x[0]); + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); - BLASLONG n1 = n & -4; - while (j < n1) { + } else { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + min = 0; + minf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { 
- if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 4c3040779..18cdba437 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) -{ - BLASLONG imax; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel 
%%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imax),"=m"(*max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imax; +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb 
%%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imax; } - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) + return (max); - if (n <= 0 || inc_x <= 0) return (max); + if (inc_x == 1) { - if (inc_x == 1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + max = idmax_kernel_32(n1, x, &maxf); - max = idmax_kernel_32(n1, x, &maxf); + i = n1; + } else { + maxf = x[0]; + i++; + } - i = n1; - } - else - { - maxf = x[0]; - i++; - } + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); + } else { - } else { + max = 0; + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } - max = 0; - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ba1776a49..02ca427e4 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) -{ - BLASLONG imin; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" 
- "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imin),"=m"(*min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imin; +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[min],0\n\t" + "vmnlg 
%%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imin; } - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) + return (min); - if (n <= 0 || inc_x <= 0) return (min); + if (inc_x == 1) { - if (inc_x == 1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + min = idmin_kernel_32(n1, x, &minf); - min = idmin_kernel_32(n1, x, &minf); + i = n1; + } else { + minf = x[0]; + i++; + } - i = n1; - } - else - { - minf = x[0]; - i++; - } + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); + } else { - } else { + min = 0; + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } - min = 0; - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 6e0aaa162..bbb4012aa 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel 
%%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb 
%%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamax; + return iamax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = isamax_kernel_64(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + max = isamax_kernel_64(n1, x, &maxf); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - max = 0; - maxf = ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + max = 0; + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 266c48f7f..e8b34b934 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel 
%%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb 
%%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamin; + return iamin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - if (n <= 0 || inc_x <= 0) return (min); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = isamin_kernel_64(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + min = isamin_kernel_64(n1, x, &minf); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); - min = 0; - minf = ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + min = 0; + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index c968ce6fa..a565df503 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) -{ - BLASLONG imax; +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf 
%%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag 
%%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imax),"=m"(*max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imax; + return imax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = ismax_kernel_64(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); + max = ismax_kernel_64(n1, x, &maxf); + i = n1; } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); - max = 0; - maxf = x[0]; + } else { 
- BLASLONG n1 = n & -4; - while (j < n1) { + max = 0; + maxf = x[0]; - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 0145b31b3..ff72b2c64 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) -{ - BLASLONG imin; +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + 
"vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + 
"vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 
\n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imin),"=m"(*min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imin; + return imin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - if (n <= 0 || inc_x <= 0) return (min); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = ismin_kernel_64(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); + min = ismin_kernel_64(n1, x, &minf); + i = n1; } else { + minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); - min = 0; - minf = x[0]; + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + min = 0; + minf = x[0]; - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2d1cc2365..48afb8215 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; - - __asm__ volatile ( - "vleg %%v0,0(%3),0 \n\t" - "vleg %%v1,8(%3),0 \n\t" - "vleg %%v0,16(%3),1 \n\t" - "vleg %%v1,24(%3),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v1,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,8 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "srlg %%r0,%2,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vleg %%v16,0(%%r1,%3),0 \n\t" - "vleg %%v17,8(%%r1,%3),0 \n\t" - "vleg %%v16,16(%%r1,%3),1 \n\t" - "vleg %%v17,24(%%r1,%3),1 \n\t" - "vleg %%v18,32(%%r1,%3),0 \n\t" - "vleg %%v19,40(%%r1,%3),0 \n\t" - "vleg %%v18,48(%%r1,%3),1 \n\t" - "vleg %%v19,56(%%r1,%3),1 \n\t" - "vleg %%v20,64(%%r1,%3),0 \n\t" - "vleg %%v21,72(%%r1,%3),0 \n\t" - "vleg %%v20,80(%%r1,%3),1 \n\t" - "vleg %%v21,88(%%r1,%3),1 \n\t" - "vleg %%v22,96(%%r1,%3),0 \n\t" - "vleg %%v23,104(%%r1,%3),0 \n\t" - "vleg %%v22,112(%%r1,%3),1 \n\t" - "vleg %%v23,120(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vleg %%v16,128(%%r1,%3),0 \n\t" - "vleg %%v17,136(%%r1,%3),0 \n\t" - "vleg %%v16,144(%%r1,%3),1 \n\t" - "vleg %%v17,152(%%r1,%3),1 \n\t" - "vleg %%v18,160(%%r1,%3),0 \n\t" - "vleg %%v19,168(%%r1,%3),0 \n\t" - "vleg %%v18,176(%%r1,%3),1 \n\t" - "vleg %%v19,184(%%r1,%3),1 \n\t" - "vleg %%v20,192(%%r1,%3),0 \n\t" - "vleg %%v21,200(%%r1,%3),0 \n\t" - "vleg %%v20,208(%%r1,%3),1 \n\t" - "vleg %%v21,216(%%r1,%3),1 \n\t" - "vleg %%v22,224(%%r1,%3),0 \n\t" - "vleg %%v23,232(%%r1,%3),0 \n\t" - "vleg %%v22,240(%%r1,%3),1 \n\t" - "vleg %%v23,248(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - 
"vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return iamax; + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb 
%%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamax; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (max); + + if (inc_x == 1) { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + BLASLONG n1 = n & -16; + if (n1 > 0) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + max = izamax_kernel_16(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - max = izamax_kernel_16(n1, x, &maxf); - ix = n1 * 2; - i = n1; + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); } - else - { - maxf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; + ix += 2; + i++; } - return (max + 1); + return (max + 1); + + } else { - } else { - max = 0; - maxf = CABS1(x,0); + maxf = CABS1(x, 0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (max + 1); + } } - - diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 676fd7c6d..3edbe3d58 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; - - __asm__ volatile ( - "vleg %%v0,0(%3),0 \n\t" - "vleg %%v1,8(%3),0 \n\t" - "vleg %%v0,16(%3),1 \n\t" - "vleg %%v1,24(%3),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v1,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,8 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "srlg %%r0,%2,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vleg %%v16,0(%%r1,%3),0 \n\t" - "vleg %%v17,8(%%r1,%3),0 \n\t" - "vleg %%v16,16(%%r1,%3),1 \n\t" - "vleg %%v17,24(%%r1,%3),1 \n\t" - "vleg %%v18,32(%%r1,%3),0 \n\t" - "vleg %%v19,40(%%r1,%3),0 \n\t" - "vleg %%v18,48(%%r1,%3),1 \n\t" - "vleg %%v19,56(%%r1,%3),1 \n\t" - "vleg %%v20,64(%%r1,%3),0 \n\t" - "vleg %%v21,72(%%r1,%3),0 \n\t" - "vleg %%v20,80(%%r1,%3),1 \n\t" - "vleg %%v21,88(%%r1,%3),1 \n\t" - "vleg %%v22,96(%%r1,%3),0 \n\t" - "vleg %%v23,104(%%r1,%3),0 \n\t" - "vleg %%v22,112(%%r1,%3),1 \n\t" - "vleg %%v23,120(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vleg %%v16,128(%%r1,%3),0 \n\t" - "vleg %%v17,136(%%r1,%3),0 \n\t" - "vleg %%v16,144(%%r1,%3),1 \n\t" - "vleg %%v17,152(%%r1,%3),1 \n\t" - "vleg %%v18,160(%%r1,%3),0 \n\t" - "vleg %%v19,168(%%r1,%3),0 \n\t" - "vleg %%v18,176(%%r1,%3),1 \n\t" - "vleg %%v19,184(%%r1,%3),1 \n\t" - "vleg %%v20,192(%%r1,%3),0 \n\t" - "vleg %%v21,200(%%r1,%3),0 \n\t" - "vleg %%v20,208(%%r1,%3),1 \n\t" - "vleg %%v21,216(%%r1,%3),1 \n\t" - "vleg %%v22,224(%%r1,%3),0 \n\t" - "vleg %%v23,232(%%r1,%3),0 \n\t" - "vleg %%v22,240(%%r1,%3),1 \n\t" - "vleg %%v23,248(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel 
%%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return iamin; + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg 
%%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamin; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (min); + + if (inc_x == 1) { - if (n <= 0 || inc_x <= 0) return(min); - - if (inc_x == 1) { + BLASLONG n1 = n & -16; + if (n1 > 0) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + min = izamin_kernel_16(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } - min = izamin_kernel_16(n1, x, &minf); - ix = n1 * 2; - i = n1; + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); } - else - { - minf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + ix += 2; + i++; } - return (min + 1); + return (min + 1); + + } else { - } else { - min = 0; - minf = CABS1(x,0); + minf = CABS1(x, 0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } - return (min + 1); + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (min + 1); + } } - - diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index b629d64c0..efbc0318c 100644 --- a/kernel/zarch/samax.c +++ 
b/kernel/zarch/samax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif - -static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" - "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" - "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" - "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" - "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" - "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" - "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" - "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" - "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" - - "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" - "lper %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,8\n\t" + "vfmaxsb %%v17,%%v17,%%v25,8\n\t" + "vfmaxsb %%v18,%%v18,%%v26,8\n\t" + "vfmaxsb %%v19,%%v19,%%v27,8\n\t" + "vfmaxsb %%v20,%%v20,%%v28,8\n\t" + "vfmaxsb %%v21,%%v21,%%v29,8\n\t" + "vfmaxsb %%v22,%%v22,%%v30,8\n\t" + "vfmaxsb %%v23,%%v23,%%v31,8\n\t" + "vfmaxsb %%v16,%%v16,%%v20,8\n\t" + "vfmaxsb %%v17,%%v17,%%v21,8\n\t" + "vfmaxsb %%v18,%%v18,%%v22,8\n\t" + "vfmaxsb %%v19,%%v19,%%v23,8\n\t" + 
"vfmaxsb %%v16,%%v16,%%v18,8\n\t" + "vfmaxsb %%v17,%%v17,%%v19,8\n\t" + "vfmaxsb %%v16,%%v16,%%v17,8\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = samax_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = samax_kernel_64(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 7ce6ee657..138836ce5 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif - -static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfminsb %%v16,%%v16,%%v24,8 \n\t" - "vfminsb %%v17,%%v17,%%v25,8 \n\t" - "vfminsb %%v18,%%v18,%%v26,8 \n\t" - "vfminsb %%v19,%%v19,%%v27,8 \n\t" - "vfminsb %%v20,%%v20,%%v28,8 \n\t" - "vfminsb %%v21,%%v21,%%v29,8 \n\t" - "vfminsb %%v22,%%v22,%%v30,8 \n\t" - "vfminsb %%v23,%%v23,%%v31,8 \n\t" - - "vfminsb %%v16,%%v16,%%v20,8 \n\t" - "vfminsb %%v17,%%v17,%%v21,8 \n\t" - "vfminsb %%v18,%%v18,%%v22,8 \n\t" - "vfminsb %%v19,%%v19,%%v23,8 \n\t" - - "vfminsb %%v16,%%v16,%%v18,8 \n\t" - "vfminsb %%v17,%%v17,%%v19,8 \n\t" - - "vfminsb %%v16,%%v16,%%v17,8 \n\t" - - "vfminsb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,8 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,8 \n\t" - "lper %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,8\n\t" + "vfminsb %%v17,%%v17,%%v25,8\n\t" + "vfminsb %%v18,%%v18,%%v26,8\n\t" + "vfminsb %%v19,%%v19,%%v27,8\n\t" + "vfminsb %%v20,%%v20,%%v28,8\n\t" + "vfminsb %%v21,%%v21,%%v29,8\n\t" + "vfminsb %%v22,%%v22,%%v30,8\n\t" + "vfminsb %%v23,%%v23,%%v31,8\n\t" + "vfminsb %%v16,%%v16,%%v20,8\n\t" + "vfminsb %%v17,%%v17,%%v21,8\n\t" + "vfminsb %%v18,%%v18,%%v22,8\n\t" + "vfminsb %%v19,%%v19,%%v23,8\n\t" + "vfminsb %%v16,%%v16,%%v18,8\n\t" + "vfminsb %%v17,%%v17,%%v19,8\n\t" + "vfminsb %%v16,%%v16,%%v17,8\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = samin_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = samin_kernel_64(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index 2c59ab2e5..0c3057a92 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,147 +28,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vfasb %%v0,%%v0,%%v3 \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "ler %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +#define ABS fabsf + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl 
%%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { - - n1 = n & -64; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = sasum_kernel_64(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -64; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = sasum_kernel_64(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index 26ead310c..e41e87af0 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
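In the saxpy rewrite below the reordering matters: all sixteen source vectors are loaded before any vfmasb issues, and the products land back in v16-v19/v24-v27 so they can be stored directly rather than being funneled through v20-v23 as before. Behaviourally it is still the plain update (saxpy_ref is a hypothetical name):

static void saxpy_ref(BLASLONG n, FLOAT da, const FLOAT *x, FLOAT *y) {
  for (BLASLONG i = 0; i < n; i++)
    y[i] += da * x[i];               /* y := da*x + y */
}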
#include "common.h" -static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( - "vlrepf %%v0,%3 \n\t" - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - - "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,80(%%r1,%1) \n\t" - "vl %%v26,96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v28,64(%%r1,%2) \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vl %%v30,96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "vl %%v16,128(%%r1,%1) \n\t" - "vl %%v17,144(%%r1,%1) \n\t" - "vl %%v18,160(%%r1,%1) \n\t" - "vl %%v19,176(%%r1,%1) \n\t" - "vl %%v20,128(%%r1,%2) \n\t" - "vl %%v21,144(%%r1,%2) \n\t" - "vl %%v22,160(%%r1,%2) \n\t" - "vl %%v23,176(%%r1,%2) \n\t" - - "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,192(%%r1,%1) \n\t" - "vl %%v25,208(%%r1,%1) \n\t" - "vl %%v26,224(%%r1,%1) \n\t" - "vl %%v27,240(%%r1,%1) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,128(%%r1,%2) \n\t" - "vst %%v17,144(%%r1,%2) \n\t" - "vst %%v18,160(%%r1,%2) \n\t" - "vst %%v19,176(%%r1,%2) \n\t" - "vst %%v20,192(%%r1,%2) \n\t" - "vst %%v21,208(%%r1,%2) \n\t" - "vst %%v22,224(%%r1,%2) \n\t" - "vst %%v23,240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepf %%v0,%[alpha]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb 
%%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + [alpha] "m"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return 0 ; + if (n <= 0) + return 0; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -64; - if ( n1 ) - saxpy_kernel_64(n1, x, y , &da); + if (n1) + saxpy_kernel_64(n1, x, y, &da); - i = n1; - while(i < n) - { - - y[i] += da * x[i] ; - i++ ; - - } - return 0 ; + i = n1; + while (i < n) { + y[i] += da * x[i]; + i++; } + return 0; - BLASLONG n1 = n & -4; + } - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = da * x[ix] ; - FLOAT m2 = da * x[ix+inc_x] ; - FLOAT m3 = da * x[ix+2*inc_x] ; - FLOAT m4 = da * x[ix+3*inc_x] ; + while (i < n1) { - y[iy] += m1 ; - y[iy+inc_y] += m2 ; - y[iy+2*inc_y] += m3 ; - y[iy+3*inc_y] += m4 ; + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; - i+=4 ; + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; - } + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; - while(i < n) - { + } - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - return 0 ; - -} + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + return 0; +} diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index ff4227595..44d27b062 100644 --- a/kernel/zarch/scopy.c +++ 
b/kernel/zarch/scopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,6 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","r2" - ); +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n]) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if (n <= 0) return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - scopy_kernel_64(n1, x, y); - i = n1; - } + if (n <= 0) + return 0; - while (i < n) { - y[i] = x[i]; - i++; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + while (i < n) { + y[i] = x[i]; + i++; - } else { + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index 5ddbc69bd..f659b0c8a 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018,The OpenBLAS Project +Copyright (c) 2013-2019,The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms,with or without modification,are permitted provided that the following conditions are @@ -27,114 +27,118 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
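sdot below gets the same accumulator treatment as sasum: the old kernel chained every vfmasb through v0, a serial dependency eight fmas long per iteration, while the new one keeps eight running sums in v0-v7 and merges them after the loop. A scalar sketch of that split (sdot_ref is a hypothetical name; as with any partial-sum rewrite, rounding may differ slightly from a strictly sequential sum):

static FLOAT sdot_ref(BLASLONG n, const FLOAT *x, const FLOAT *y) {
  FLOAT d0 = 0.0, d1 = 0.0;          /* two independent partials */
  BLASLONG i;
  for (i = 0; i + 2 <= n; i += 2) {
    d0 += x[i] * y[i];
    d1 += x[i + 1] * y[i + 1];
  }
  for (; i < n; i++)
    d0 += x[i] * y[i];
  return d0 + d1;
}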
#include "common.h" -static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vrepf %%v1,%%v0,1 \n\t" - "vrepf %%v2,%%v0,2 \n\t" - "vrepf %%v3,%%v0,3 \n\t" - "aebr %%f0,%%f1 \n\t" - "aebr %%f0,%%f2 \n\t" - "aebr %%f0,%%f3 \n\t" - "ler %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "vrepf %%v1,%%v0,1\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepf %%v3,%%v0,3\n\t" + "aebr %%f0,%%f1\n\t" + "aebr %%f0,%%f2\n\t" + "aebr %%f0,%%f3\n\t" + "ler %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), + [y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, 
FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -32; - if ( n1 ) - dot = sdot_kernel_32(n1,x,y); + if (n1) + dot = sdot_kernel_32(n1, x, y); - i = n1; - while(i < n) - { + i = n1; + while (i < n) { - dot += y[i] * x[i] ; - i++ ; + dot += y[i] * x[i]; + i++; - } - return(dot); + } + return (dot); + } - } + BLASLONG n1 = n & -2; - BLASLONG n1 = n & -2; + while (i < n1) { - while(i < n1) - { + dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; - dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; - ix += inc_x*2 ; - iy += inc_y*2 ; - i+=2 ; + } - } + while (i < n) { - while(i < n) - { + dot += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); + } + return (dot); } - - diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 01d8414de..86ac24993 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,640 +29,559 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%5) \n\t" - "vlrepf %%v1,4(%5) \n\t" - "vlrepf %%v2,8(%5) \n\t" - "vlrepf %%v3,12(%5) \n\t" - "vlrepf %%v4,%7 \n\t" - "vfmsb %%v0,%%v0,%%v4 \n\t" - "vfmsb %%v1,%%v1,%%v4 \n\t" - "vfmsb %%v2,%%v2,%%v4 \n\t" - "vfmsb %%v3,%%v3,%%v4 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - "vl %%v24,32(%%r1,%1) \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vl %%v28,48(%%r1,%1) \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "vl %%v4,32(%%r1,%6) \n\t" - "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,32(%%r1,%6) \n\t" - - "vl %%v4,48(%%r1,%6) \n\t" - "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmasb 
%%v4,%%v30,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,48(%%r1,%6) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,64(%%r1,%2) \n\t" - "vl %%v18,64(%%r1,%3) \n\t" - "vl %%v19,64(%%r1,%4) \n\t" - "vl %%v20,80(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,80(%%r1,%3) \n\t" - "vl %%v23,80(%%r1,%4) \n\t" - "vl %%v24,96(%%r1,%1) \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vl %%v28,112(%%r1,%1) \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - - "vl %%v4,64(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,64(%%r1,%6) \n\t" - - "vl %%v4,80(%%r1,%6) \n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,80(%%r1,%6) \n\t" - - "vl %%v4,96(%%r1,%6) \n\t" - "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,96(%%r1,%6) \n\t" - - "vl %%v4,112(%%r1,%6) \n\t" - "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,112(%%r1,%6) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,8(%[x])\n\t" + "vlrepf %%v3,12(%[x])\n\t" + "vlrepf %%v4,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v4\n\t" + "vfmsb %%v1,%%v1,%%v4\n\t" + "vfmsb %%v2,%%v2,%%v4\n\t" + "vfmsb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl 
%%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%3) \n\t" - "vlrepf %%v1,4(%3) \n\t" - "vlrepf %%v2,%5 \n\t" - "vfmsb %%v0,%%v0,%%v2 \n\t" - "vfmsb %%v1,%%v1,%%v2 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr 
%%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "vl %%v20,32(%%r1,%1) \n\t" - "vl %%v21,32(%%r1,%2) \n\t" - "vl %%v22,48(%%r1,%1) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vl %%v26,80(%%r1,%1) \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vl %%v28,96(%%r1,%1) \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%1) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "vl %%v2,32(%%r1,%4) \n\t" - "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" - "vst %%v2,32(%%r1,%4) \n\t" - - "vl %%v2,48(%%r1,%4) \n\t" - "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" - "vst %%v2,48(%%r1,%4) \n\t" - - "vl %%v2,64(%%r1,%4) \n\t" - "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" - "vst %%v2,64(%%r1,%4) \n\t" - - "vl %%v2,80(%%r1,%4) \n\t" - "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" - "vst %%v2,80(%%r1,%4) \n\t" - - "vl %%v2,96(%%r1,%4) \n\t" - "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" - "vst %%v2,96(%%r1,%4) \n\t" - - "vl %%v2,112(%%r1,%4) \n\t" - "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" - "vst %%v2,112(%%r1,%4) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v2\n\t" + "vfmsb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl 
%%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%2) \n\t" - "vlrepf %%v1,%4 \n\t" - "vfmsb %%v0,%%v0,%%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%1) \n\t" - "vl %%v22,96(%%r1,%1) \n\t" - "vl %%v23,112(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "vl %%v1,32(%%r1,%3) \n\t" - "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" - "vst %%v1,32(%%r1,%3) \n\t" - - "vl %%v1,48(%%r1,%3) \n\t" - "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" - "vst %%v1,48(%%r1,%3) \n\t" - - "vl %%v1,64(%%r1,%3) \n\t" - "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" - "vst %%v1,64(%%r1,%3) \n\t" - - "vl %%v1,80(%%r1,%3) \n\t" - "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" - "vst %%v1,80(%%r1,%3) \n\t" - - "vl %%v1,96(%%r1,%3) \n\t" - "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" - "vst %%v1,96(%%r1,%3) \n\t" - - "vl %%v1,112(%%r1,%3) \n\t" - "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" - "vst %%v1,112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" 
- "vst %%v1,0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v16,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,0(%%r1,%[y])\n\t" + "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" + "vst %%v17,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i]; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8],*ybuffer; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - ybuffer = buffer; - - n1 = n >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - 
ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 4); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; 
*/ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + } + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * 
x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } - return(0); + return (0); } - - diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index a3136723a..6ae9b6d7f 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,783 +29,717 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl 
%%v30,80(%%r1,%3) \n\t" - "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v4,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v4 \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "ste %%f0,0(%6) \n\t" - "veslg %%v4,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v4 \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "ste %%f1,4(%6) \n\t" - "veslg %%v4,%%v2,32 \n\t" - "vfasb %%v2,%%v2,%%v4 \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "ste %%f2,8(%6) \n\t" - "veslg %%v4,%%v3,32 \n\t" - "vfasb %%v3,%%v3,%%v4 \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "ste %%f3,12(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmasb 
%%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v2,%%v2,%%v6\n\t" + "vfasb %%v3,%%v3,%%v7\n\t" + "veslg %%v4,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vrepg %%v4,%%v0,1\n\t" + "aebr %%f0,%%f4\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v4,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v4\n\t" + "vrepg %%v4,%%v1,1\n\t" + "aebr %%f1,%%f4\n\t" + "ste %%f1,4(%[y])\n\t" + "veslg %%v4,%%v2,32\n\t" + "vfasb %%v2,%%v2,%%v4\n\t" + "vrepg %%v4,%%v2,1\n\t" + "aebr %%f2,%%f4\n\t" + "ste %%f2,8(%[y])\n\t" + "veslg %%v4,%%v3,32\n\t" + "vfasb %%v3,%%v3,%%v4\n\t" + "vrepg %%v4,%%v3,1\n\t" + "aebr %%f3,%%f4\n\t" + "ste %%f3,12(%[y])" + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 
1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v2,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "ste %%f0,0(%4) \n\t" - "veslg %%v2,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v2 \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "ste %%f1,4(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl 
%%v31,48(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v1,%%v1,%%v3\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v1,%%v1,%%v7\n\t" + "veslg %%v2,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vrepg %%v2,%%v0,1\n\t" + "aebr %%f0,%%f2\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v2,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v2\n\t" + "vrepg %%v2,%%v1,1\n\t" + "aebr %%f1,%%f2\n\t" + "ste %%f1,4(%[y])" + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "ste %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - 
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "veslg %%v1,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vrepg %%v1,%%v0,1\n\t" + "aebr %%f0,%%f1\n\t" + "ste %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } } - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) 
\n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepf %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if 
(inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - } + y_ptr = y; + a_ptr = a; + x_ptr = x; + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - yp = ytemp; + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x4(NB,ap,xbuffer,yp); - ap[0] += lda4 ; - ap[1] += lda4 ; - ap[2] += lda4 ; - ap[3] += lda4 ; - yp += 4; - } - if ( n1 > 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + + yp = ytemp; + for (i = 0; i < nb1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; - if ( n2 & 2 ) - { + } - sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + } - } + yp = ytemp; - if ( n2 & 1 ) - { + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; + } + if (n1 > 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + if (n2 & 2) { + + sgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - } - a += NB; - x += NB * inc_x; } - if ( m3 == 0 ) return(0); + if (n2 & 1) { - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; 
j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = smax_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = smax_kernel_64(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index e7d83441b..2e9c793c4 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
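The smax rewrite above and the smin rewrite below share one blocking scheme: on the unit-stride path, n is rounded down to a multiple of 64 with n & -64 and that prefix is handed to the inline-assembly kernel, leaving a plain scalar loop for the tail; the strided path unrolls by four via n & -4. A minimal scalar sketch of that scheme follows, with reduce_max_blocked as a hypothetical name standing in for the real CNAME/smax_kernel_64 pair (the first loop models the single vector-kernel call):

#include <stddef.h>

/* Illustrative only: models the blocking in the smax/smin CNAME routines.
   Caller must ensure n >= 1, as CNAME does with its n <= 0 check. */
static float reduce_max_blocked(size_t n, const float *x) {
  size_t n1 = n & ~(size_t)63;  /* same effect as the kernels' n & -64 */
  size_t i;
  float maxf = x[0];
  for (i = 0; i < n1; i++)      /* stands in for smax_kernel_64(n1, x) */
    if (x[i] > maxf)
      maxf = x[i];
  for (; i < n; i++)            /* scalar tail, exactly as in CNAME */
    if (x[i] > maxf)
      maxf = x[i];
  return maxf;
}

The kernel itself reduces sixteen vector registers pairwise with vfminsb/vfmaxsb and finishes with a horizontal veslg/vrepf fold, but the arithmetic result is the same as this scalar scan.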
#include "common.h" -static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfminsb %%v16,%%v16,%%v24,0 \n\t" - "vfminsb %%v17,%%v17,%%v25,0 \n\t" - "vfminsb %%v18,%%v18,%%v26,0 \n\t" - "vfminsb %%v19,%%v19,%%v27,0 \n\t" - "vfminsb %%v20,%%v20,%%v28,0 \n\t" - "vfminsb %%v21,%%v21,%%v29,0 \n\t" - "vfminsb %%v22,%%v22,%%v30,0 \n\t" - "vfminsb %%v23,%%v23,%%v31,0 \n\t" - - "vfminsb %%v16,%%v16,%%v20,0 \n\t" - "vfminsb %%v17,%%v17,%%v21,0 \n\t" - "vfminsb %%v18,%%v18,%%v22,0 \n\t" - "vfminsb %%v19,%%v19,%%v23,0 \n\t" - - "vfminsb %%v16,%%v16,%%v18,0 \n\t" - "vfminsb %%v17,%%v17,%%v19,0 \n\t" - - "vfminsb %%v16,%%v16,%%v17,0 \n\t" - - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v17,%%v17,%%v25,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v19,%%v19,%%v27,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v21,%%v21,%%v29,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v23,%%v23,%%v31,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v17,%%v17,%%v21,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v19,%%v19,%%v23,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v17,%%v17,%%v19,0\n\t" + "vfminsb %%v16,%%v16,%%v17,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); 
- if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = smin_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = smin_kernel_64(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 763cc664a..5b21a19dc 100644 --- a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
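The srot kernel that follows applies the standard BLAS plane (Givens) rotation, 64 elements per loop iteration: the vfmsb/vfmasb pairs build c*x + s*y while the vfmsb/vfmssb pairs build c*y - s*x, matching the "yn=x*s" and "yn=y*c-yn" comments in the asm. A hedged scalar model of the per-element update, with srot_scalar as a hypothetical illustrative name:

/* Illustrative only: the per-element step that srot_kernel_64 performs
   in blocks of 64 (vfmsb = mul, vfmasb = mul-add, vfmssb = mul-sub). */
static void srot_scalar(long n, float *x, float *y, float c, float s) {
  long i;
  for (i = 0; i < n; i++) {
    float xi = x[i];            /* old x is needed for the y update */
    x[i] = c * xi + s * y[i];   /* v28 = x*c, then v28 += y*s */
    y[i] = c * y[i] - s * xi;   /* v20 = x*s, then v20 = y*c - v20 */
  }
}

Keeping the old x in a temporary mirrors what the vector code does by computing both products before any store, so the y update never sees an already-rotated x.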
#include "common.h" -static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepf %%v0,%3 \n\t" - "vlrepf %%v1,%4 \n\t" - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" 
/* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl 
%%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] 
"a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -64; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - srot_kernel_64(n1, x, y, &cosa, &sina); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - - i++ ; + BLASLONG n1 = n & -64; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i = n1; + } - } + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index c18a7e56f..07e6845c6 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,175 +27,147 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%2) \n\t" - "vfmsb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vfmsb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vfmsb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmsb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vl %%v24, 64(%%r1,%2) \n\t" - "vfmsb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%2) \n\t" - "vfmsb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%2) \n\t" - "vfmsb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%2) \n\t" - "vfmsb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v24","v25","v26","v27" - ); +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepf %%v0,%[da]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmsb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmsb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmsb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmsb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmsb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmsb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmsb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmsb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x),[da] "m"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, 
FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { - - if ( da == 0.0 ) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - - sscal_kernel_32_zero(n1, x); - j=n1; - } - - while(j < n) - { - - x[j]=0.0; - j++; - } - - } - else - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - sscal_kernel_32(n1, da, x); - j=n1; - } - while(j < n) - { - - x[j] = da * x[j] ; - j++; - } - } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + if (inc_x == 1) { + if (da == 0.0) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + sscal_kernel_32_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + sscal_kernel_32(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -2; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -2; - x[i]=0.0; - x[i + inc_x]=0.0; + while (j < n1) { - i += inc_x * 2; - j += 2; + x[i] = 0.0; + x[i + inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 2; + j += 2; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -2; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -2; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; + while (j < n1) { - i += inc_x * 2; - j += 2; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; - } + i += inc_x * 2; + j += 2; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; - -} + } + return 0; +} diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index d0c0dc3f4..dc7113143 100644 --- a/kernel/zarch/sswap.c +++ b/kernel/zarch/sswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,138 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
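The C fallbacks around every kernel in this series use the same blocking arithmetic: n & -32 (or -64, -16, -4) rounds n down to a multiple of the kernel's block size via two's complement, the vector routine covers that prefix, and a scalar loop finishes the tail. A small self-contained illustration (round_down is a hypothetical helper, not part of the patch):

#include <assert.h>

static long round_down(long n, long block) {
  /* block must be a power of two: -block is an all-ones mask above it */
  return n & -block;
}

int main(void) {
  assert(round_down(100, 32) == 96); /* sscal blocks by 32 floats */
  assert(round_down(100, 64) == 64); /* srot/sswap/smin block by 64 */
  assert(round_down(31, 32) == 0);   /* tail-only sizes skip the kernel */
  return 0;
}
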
#include "common.h" -static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 
32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; - BLASLONG n1 = n & -64; - if ( n1 > 0 ) - { - sswap_kernel_64(n1, x, y); - i=n1; - } + if (n <= 0) + return (0); - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -64; + if (n1 > 0) { + sswap_kernel_64(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index cc6347127..531e47a0b 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vleg %%v24,128(%%r1,%2),0 \n\t" - "vleg %%v25,136(%%r1,%2),0 \n\t" - "vleg %%v24,144(%%r1,%2),1 \n\t" - "vleg %%v25,152(%%r1,%2),1 \n\t" - "vleg %%v26,160(%%r1,%2),0 \n\t" - "vleg %%v27,168(%%r1,%2),0 \n\t" - "vleg %%v26,176(%%r1,%2),1 \n\t" - "vleg %%v27,184(%%r1,%2),1 \n\t" - "vleg %%v28,192(%%r1,%2),0 \n\t" - "vleg %%v29,200(%%r1,%2),0 \n\t" - "vleg %%v28,208(%%r1,%2),1 \n\t" - "vleg %%v29,216(%%r1,%2),1 \n\t" - "vleg %%v30,224(%%r1,%2),0 \n\t" - "vleg %%v31,232(%%r1,%2),0 \n\t" - "vleg %%v30,240(%%r1,%2),1 \n\t" - "vleg %%v31,248(%%r1,%2),1 \n\t" - - "vflpdb %%v16,%%v16 \n\t" - "vflpdb %%v17,%%v17 \n\t" - "vflpdb %%v18,%%v18 \n\t" - "vflpdb %%v19,%%v19 \n\t" - "vflpdb %%v20,%%v20 \n\t" - "vflpdb %%v21,%%v21 \n\t" - "vflpdb %%v22,%%v22 \n\t" - "vflpdb %%v23,%%v23 \n\t" - "vflpdb %%v24,%%v24 \n\t" - "vflpdb %%v25,%%v25 \n\t" - "vflpdb %%v26,%%v26 \n\t" - "vflpdb %%v27,%%v27 \n\t" - "vflpdb %%v28,%%v28 \n\t" - "vflpdb %%v29,%%v29 \n\t" - "vflpdb %%v30,%%v30 \n\t" - "vflpdb %%v31,%%v31 \n\t" - - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v18,%%v18,%%v19 \n\t" - "vfadb %%v20,%%v20,%%v21 \n\t" - "vfadb %%v22,%%v22,%%v23 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" - "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" - "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" - "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" - - "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" - "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" - - "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" - - "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg 
%%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb %%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { - - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - 
} - return (maxf); + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index ae711c173..cac2da938 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg 
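The zamax/zamin family ranks complex elements by CABS1, the 1-norm |re| + |im|, rather than the Euclidean modulus; that is the BLAS convention for i/zamax-style routines and it avoids a sqrt per element. A short sketch of the scalar semantics the vector code above reproduces (illustrative only, not part of the patch):

#include <math.h>
#include <stdio.h>

#define CABS1(x, i) (fabs((x)[i]) + fabs((x)[(i) + 1]))

int main(void) {
  /* interleaved re,im pairs: (3,-4) and (-1,5) */
  double x[] = {3.0, -4.0, -1.0, 5.0};
  double m = CABS1(x, 0) > CABS1(x, 2) ? CABS1(x, 0) : CABS1(x, 2);
  printf("%g\n", m); /* prints 7: |3| + |-4| beats |-1| + |5| */
  return 0;
}
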
%%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return amax; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb 
%%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { - - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (maxf); + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 18610daea..940d81dd2 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vleg %%v24,128(%%r1,%2),0 \n\t" - "vleg %%v25,136(%%r1,%2),0 \n\t" - "vleg %%v24,144(%%r1,%2),1 \n\t" - "vleg %%v25,152(%%r1,%2),1 \n\t" - "vleg %%v26,160(%%r1,%2),0 \n\t" - "vleg %%v27,168(%%r1,%2),0 \n\t" - "vleg %%v26,176(%%r1,%2),1 \n\t" - "vleg %%v27,184(%%r1,%2),1 \n\t" - "vleg %%v28,192(%%r1,%2),0 \n\t" - "vleg %%v29,200(%%r1,%2),0 \n\t" - "vleg %%v28,208(%%r1,%2),1 \n\t" - "vleg %%v29,216(%%r1,%2),1 \n\t" - "vleg %%v30,224(%%r1,%2),0 \n\t" - "vleg %%v31,232(%%r1,%2),0 \n\t" - "vleg %%v30,240(%%r1,%2),1 \n\t" - "vleg %%v31,248(%%r1,%2),1 \n\t" - - "vflpdb %%v16,%%v16 \n\t" - "vflpdb %%v17,%%v17 \n\t" - "vflpdb %%v18,%%v18 \n\t" - "vflpdb %%v19,%%v19 \n\t" - "vflpdb %%v20,%%v20 \n\t" - "vflpdb %%v21,%%v21 \n\t" - "vflpdb %%v22,%%v22 \n\t" - "vflpdb %%v23,%%v23 \n\t" - "vflpdb %%v24,%%v24 \n\t" - "vflpdb %%v25,%%v25 \n\t" - "vflpdb %%v26,%%v26 \n\t" - "vflpdb %%v27,%%v27 \n\t" - "vflpdb %%v28,%%v28 \n\t" - "vflpdb %%v29,%%v29 \n\t" - "vflpdb %%v30,%%v30 \n\t" - "vflpdb %%v31,%%v31 \n\t" - - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v18,%%v18,%%v19 \n\t" - "vfadb %%v20,%%v20,%%v21 \n\t" - "vfadb %%v22,%%v22,%%v23 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfmindb %%v16,%%v16,%%v24,0 \n\t" - "vfmindb %%v18,%%v18,%%v26,0 \n\t" - "vfmindb %%v20,%%v20,%%v28,0 \n\t" - "vfmindb %%v22,%%v22,%%v30,0 \n\t" - - "vfmindb %%v16,%%v16,%%v20,0 \n\t" - "vfmindb %%v18,%%v18,%%v22,0 \n\t" - - "vfmindb %%v16,%%v16,%%v18,0 \n\t" - - "vfmindb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg 
%%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if 
(CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c index f82c57e81..7417e0b74 100644 --- a/kernel/zarch/zamin_z13.c +++ b/kernel/zarch/zamin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg 
%%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return amin; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb 
%%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 8faaf20eb..43ae8ff8b 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,138 +28,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
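/*
 * For reference: zamin_kernel_16 above vectorizes the reduction below,
 * using vflpdb (absolute value), vfadb (|re| + |im|) and vfchdb/vsel
 * (compare high and select) to track the minimum of CABS1 over 16
 * complex elements per loop iteration.  A minimal portable sketch with
 * an illustrative name, assuming FLOAT is double and BLASLONG is long,
 * as in these double-precision kernels:
 */
#include <math.h>

static double zamin_ref(long n, const double *x) {
  /* x holds n interleaved (re, im) pairs; n is assumed positive. */
  double minf = fabs(x[0]) + fabs(x[1]);
  for (long i = 1; i < n; i++) {
    double v = fabs(x[2 * i]) + fabs(x[2 * i + 1]);
    if (v < minf)
      minf = v;
  }
  return minf;
}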
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v2 \n\t" - "vfadb %%v0,%%v0,%%v3 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, 
%%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if (n <= 0 || inc_x <= 0) return(sumf); + if (n <= 0 || inc_x <= 0) + return (sumf); - if ( inc_x == 1 ) - { + if (inc_x == 1) { - n1 = n & -16; - if ( n1 > 0 ) - { + n1 = n & -16; + if (n1 > 0) { - sumf = zasum_kernel_16(n1, x); - i=n1; - ip=2*n1; - } - - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = zasum_kernel_16(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index f0e993d2f..31549849d 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,144 +27,136 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "vlrepg %%v0,0(%3) \n\t" - "vleg %%v1,8(%3),0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,8(%3),1 \n\t" -#else - "vleg %%v0,0(%3),1 \n\t" - "vflcdb %%v0,%%v0 \n\t" - "vleg %%v0,0(%3),0 \n\t" - "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" +#else + "vleg %%v0,0(%[alpha]),1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,0(%[alpha]),0\n\t" + "vlrepg %%v1,8(%[alpha])\n\t" #endif - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,80(%%r1,%1) \n\t" - "vl %%v18,96(%%r1,%1) \n\t" - "vl %%v19,112(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,64(%%r1,%2) \n\t" - "vst %%v29,80(%%r1,%2) \n\t" - "vst %%v30,96(%%r1,%2) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vpdi %%v24,%%v8,%%v8,4\n\t" + "vpdi %%v25,%%v9,%%v9,4\n\t" + "vpdi %%v26,%%v10,%%v10,4\n\t" + "vpdi %%v27,%%v11,%%v11,4\n\t" + "vpdi %%v28,%%v16,%%v16,4\n\t" + "vpdi %%v29,%%v17,%%v17,4\n\t" + 
"vpdi %%v30,%%v18,%%v18,4\n\t" + "vpdi %%v31,%%v19,%%v19,4\n\t" + "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT da[2] __attribute__ ((aligned(16))); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2] __attribute__ ((aligned(16))); - if (n <= 0) return (0); + if (n <= 0) + return (0); - if ((inc_x == 1) && (inc_y == 1)) { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -8; + BLASLONG n1 = n & -8; - if (n1) { - da[0] = da_r; - da[1] = da_i; - zaxpy_kernel_8(n1, x, y, da); - ix = 2 * n1; - } - i = n1; - while (i < n) { + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { #if !defined(CONJ) - y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); - y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); - y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - i++; - ix += 2; - - } - return (0); - + i++; + ix += 2; } + return (0); - inc_x *= 2; - inc_y *= 2; + } - while (i < n) { + inc_x *= 2; + inc_y *= 2; + + while (i < n) { #if !defined(CONJ) - y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); - y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); - y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } - return (0); + } + return (0); } - - diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 8c940bba3..2f80cedce 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,4 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","r2" - ); +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],4\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n * 2]) x) + : "cc"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + if (n <= 0) + return (0); - if ( (inc_x == 1) && (inc_y == 1 )) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zcopy_kernel_16(n1, x, y); - i=n1; - ix=n1*2; - iy=n1*2; - } - - while(i < n) - { - y[iy] = x[iy] ; - y[iy+1] = x[ix+1] ; - ix+=2; - iy+=2; - i++ ; - - } + BLASLONG n1 = n & -16; + if (n1 > 0) { + zcopy_kernel_16(n1, x, y); + i = n1; + ix = n1 * 2; + iy = n1 * 2; + } + while (i < n) { + y[iy] = x[iy]; + y[iy + 1] = x[ix + 1]; + ix += 2; + iy += 2; + i++; } - else - { - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + } else { - while(i < n) - { - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; - } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += inc_x2; + iy += inc_y2; + i++; } - - return(0); + + } + + return (0); } diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index aab18e2e9..7a67ef734 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,152 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
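/*
 * For reference: zcopy_kernel_16 above copies 16 complex doubles (256
 * bytes) per iteration with a single mvc and advances both pointers
 * via la, keeping them in the named operand registers instead of
 * staging copies in r1/r2 first.  The vectorized part is just a block
 * copy, sketched here with an illustrative name for a length that is a
 * multiple of 16 complex elements.  (In the unit-stride tail loop of
 * CNAME, ix and iy advance in lockstep, so reading the real part as
 * x[iy] is equivalent to x[ix].)
 */
#include <string.h>

static void zcopy_ref(long n, const double *x, double *y) {
  /* n complex elements = 2 * n doubles of interleaved (re, im) data. */
  memcpy(y, x, (size_t)(2 * n) * sizeof(double));
}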
#include "common.h" -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "vzero %%v28 \n\t" - "vzero %%v29 \n\t" - "vzero %%v30 \n\t" - "vzero %%v31 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - - "vl %%v16, 64(%%r1,%1) \n\t" - "vl %%v17, 80(%%r1,%1) \n\t" - "vl %%v18, 96(%%r1,%1) \n\t" - "vl %%v19, 112(%%r1,%1) \n\t" - "vl %%v0, 64(%%r1,%2) \n\t" - "vl %%v1, 80(%%r1,%2) \n\t" - "vl %%v2, 96(%%r1,%2) \n\t" - "vl %%v3, 112(%%r1,%2) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v24,%%v24,%%v26 \n\t" - "vfadb %%v24,%%v24,%%v28 \n\t" - "vfadb %%v24,%%v24,%%v30 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vfadb %%v25,%%v25,%%v29 \n\t" - "vfadb %%v25,%%v25,%%v31 \n\t" - "vsteg %%v24,0(%3),0 \n\t" - "vsteg %%v24,8(%3),1 \n\t" - "vsteg %%v25,16(%3),1 \n\t" - "vsteg %%v25,24(%3),0 " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 
112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v25,%%v25,%%v27\n\t" + "vfadb %%v25,%%v25,%%v29\n\t" + "vfadb %%v25,%%v25,%%v31\n\t" + "vsteg %%v24,0(%[d]),0\n\t" + "vsteg %%v24,8(%[d]),1\n\t" + "vsteg %%v25,16(%[d]),1\n\t" + "vsteg %%v25,24(%[d]),0" + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - if ((inc_x == 1) && (inc_y == 1)) { + } - BLASLONG n1 = n & -8; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) - zdot_kernel_8(n1, x, y, dot); + BLASLONG n1 = n & -8; - i = n1; - BLASLONG j = i * 2; + if (n1) + zdot_kernel_8(n1, x, y, dot); - while (i < n) { + i = n1; + BLASLONG j = i * 2; - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; + while (i < n) { - j += 2; - i++; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - } + j += 2; + i++; + } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 9472b5d5a..7f21985ec 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ 
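/*
 * For reference: zdot_kernel_8 above never forms individual complex
 * products; it keeps four running sums,
 *   dot[0] = sum xr*yr,  dot[1] = sum xi*yi,
 *   dot[2] = sum xr*yi,  dot[3] = sum xi*yr,
 * and CNAME combines them at the end as (dot[0] - dot[1]) +
 * i*(dot[2] + dot[3]) without CONJ, or (dot[0] + dot[1]) +
 * i*(dot[2] - dot[3]) with it.  A scalar sketch of the non-CONJ
 * combination, with an illustrative name and unit strides assumed:
 */
static void zdot_ref(long n, const double *x, const double *y,
                     double *re, double *im) {
  double rr = 0.0, ii = 0.0, ri = 0.0, ir = 0.0;
  for (long j = 0; j < 2 * n; j += 2) {
    rr += x[j] * y[j];         /* xr * yr */
    ii += x[j + 1] * y[j + 1]; /* xi * yi */
    ri += x[j] * y[j + 1];     /* xr * yi */
    ir += x[j + 1] * y[j];     /* xi * yr */
  }
  *re = rr - ii;
  *im = ri + ir;
}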
/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,691 +25,632 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include #include "common.h" #define NBMAX 1024 -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%5) \n\t" - "vl %%v17,16(%5) \n\t" - "vl %%v18,32(%5) \n\t" - "vl %%v19,48(%5) \n\t" +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v20,8(%5),0 \n\t" - "wflcdb %%v20,%%v20 \n\t" - "vleg %%v20,0(%5),1 \n\t" - "vleg %%v21,24(%5),0 \n\t" - "wflcdb %%v21,%%v21 \n\t" - "vleg %%v21,16(%5),1 \n\t" - "vleg %%v22,40(%5),0 \n\t" - "wflcdb %%v22,%%v22 \n\t" - "vleg %%v22,32(%5),1 \n\t" - "vleg %%v23,56(%5),0 \n\t" - "wflcdb %%v23,%%v23 \n\t" - "vleg %%v23,48(%5),1 \n\t" + "vleg %%v20,8(%[x]),0\n\t" + "wflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "wflcdb %%v22,%%v22\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vleg %%v23,56(%[x]),0\n\t" + "wflcdb %%v23,%%v23\n\t" + "vleg %%v23,48(%[x]),1\n\t" #else - "vleg %%v20,0(%5),1 \n\t" - "vflcdb %%v20,%%v20 \n\t" - "vleg %%v20,8(%5),0 \n\t" - "vleg %%v21,16(%5),1 \n\t" - "vflcdb %%v21,%%v21 \n\t" - "vleg %%v21,24(%5),0 \n\t" - "vleg %%v22,32(%5),1 \n\t" - "vflcdb %%v22,%%v22 \n\t" - "vleg %%v22,40(%5),0 \n\t" - "vleg %%v23,48(%5),1 \n\t" - "vflcdb %%v23,%%v23 \n\t" - "vleg %%v23,56(%5),0 \n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vflcdb %%v21,%%v21\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vflcdb %%v22,%%v22\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "vleg %%v23,48(%[x]),1\n\t" + "vflcdb %%v23,%%v23\n\t" + "vleg %%v23,56(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlrepg %%v24,0(%%r1,%1) \n\t" - "vlrepg %%v25,8(%%r1,%1) \n\t" - "vlrepg %%v26,0(%%r1,%2) \n\t" - "vlrepg %%v27,8(%%r1,%2) \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlrepg %%v28,0(%%r1,%3) \n\t" - "vlrepg %%v29,8(%%r1,%3) \n\t" - "vlrepg %%v30,0(%%r1,%4) \n\t" - "vlrepg %%v31,8(%%r1,%4) \n\t" - - "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,0(%%r1,%6) \n\t" - - "vlrepg %%v24,16(%%r1,%1) \n\t" - "vlrepg %%v25,24(%%r1,%1) \n\t" - "vlrepg %%v26,16(%%r1,%2) \n\t" - "vlrepg %%v27,24(%%r1,%2) \n\t" - - "vl %%v0,16(%%r1,%6) \n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmadb 
%%v0,%%v25,%%v20,%%v0 \n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlrepg %%v28,16(%%r1,%3) \n\t" - "vlrepg %%v29,24(%%r1,%3) \n\t" - "vlrepg %%v30,16(%%r1,%4) \n\t" - "vlrepg %%v31,24(%%r1,%4) \n\t" - - "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,16(%%r1,%6) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap0])\n\t" + "vlrepg %%v29,24(%%r1,%[ap0])\n\t" + "vlrepg %%v30,16(%%r1,%[ap1])\n\t" + "vlrepg %%v31,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" + "vlrepg %%v24,0(%%r1,%[ap2])\n\t" + "vlrepg %%v25,8(%%r1,%[ap2])\n\t" + "vlrepg %%v26,0(%%r1,%[ap3])\n\t" + "vlrepg %%v27,8(%%r1,%[ap3])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%3) \n\t" - "vl %%v17,16(%3) \n\t" +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v18,8(%3),0 \n\t" - "wflcdb %%v18,%%v18 \n\t" - "vleg %%v18,0(%3),1 \n\t" - "vleg %%v19,24(%3),0 \n\t" - "wflcdb %%v19,%%v19 \n\t" - "vleg %%v19,16(%3),1 \n\t" + "vleg %%v18,8(%[x]),0\n\t" + "wflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vleg %%v19,24(%[x]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,16(%[x]),1\n\t" #else - 
"vleg %%v18,0(%3),1 \n\t" - "vflcdb %%v18,%%v18 \n\t" - "vleg %%v18,8(%3),0 \n\t" - "vleg %%v19,16(%3),1 \n\t" - "vflcdb %%v19,%%v19 \n\t" - "vleg %%v19,24(%3),0 \n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "vleg %%v19,16(%[x]),1\n\t" + "vflcdb %%v19,%%v19\n\t" + "vleg %%v19,24(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlrepg %%v20,0(%%r1,%1) \n\t" - "vlrepg %%v21,8(%%r1,%1) \n\t" - "vlrepg %%v22,0(%%r1,%2) \n\t" - "vlrepg %%v23,8(%%r1,%2) \n\t" - - "vl %%v0,0(%%r1,%4) \n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,0(%%r1,%4) \n\t" - - "vlrepg %%v20,16(%%r1,%1) \n\t" - "vlrepg %%v21,24(%%r1,%1) \n\t" - "vlrepg %%v22,16(%%r1,%2) \n\t" - "vlrepg %%v23,24(%%r1,%2) \n\t" - - "vl %%v0,16(%%r1,%4) \n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,16(%%r1,%4) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27"); } -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%2) \n\t" +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v17,8(%2),0 \n\t" - "wflcdb %%v17,%%v17 \n\t" - "vleg %%v17,0(%2),1 \n\t" + "vleg %%v17,8(%[x]),0\n\t" + "wflcdb %%v17,%%v17\n\t" + "vleg %%v17,0(%[x]),1\n\t" #else - "vleg %%v17,0(%2),1 \n\t" - "vflcdb %%v17,%%v17 \n\t" - "vleg %%v17,8(%2),0 \n\t" + "vleg %%v17,0(%[x]),1\n\t" + "vflcdb %%v17,%%v17\n\t" + "vleg %%v17,8(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vlrepg %%v18,0(%%r1,%1) \n\t" - "vlrepg %%v19,8(%%r1,%1) \n\t" - - "vl 
%%v0,0(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,0(%%r1,%3) \n\t" - - "vlrepg %%v18,16(%%r1,%1) \n\t" - "vlrepg %%v19,24(%%r1,%1) \n\t" - - "vl %%v0,16(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vlrepg %%v20,16(%%r1,%[ap])\n\t" + "vlrepg %%v21,24(%%r1,%[ap])\n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) -{ - __asm__ volatile ( -#if !defined(XCONJ) - "vlrepg %%v0,%3 \n\t" - "vleg %%v1,%4,0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,%4,1 \n\t" +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepg %%v0,%[alpha_r]\n\t" + "vleg %%v1,%[alpha_i],0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,%[alpha_i],1\n\t" #else - "vleg %%v0,%3,1 \n\t" - "vflcdb %%v0,%%v0 \n\t" - "vleg %%v0,%3,0 \n\t" - "vlrepg %%v1,%4 \n\t" + "vleg %%v0,%[alpha_r],1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,%[alpha_r],0\n\t" + "vlrepg %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,2 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "agfi %%r1,64 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl 
%%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), + [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if ( inc_dest != 2 ) - { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); - else - xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - - for( i = 0; i < n1 ; i++) - { - zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if ( n2 & 2 ) - { - zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if ( n2 & 1 ) - { - zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } - else - { - - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - - - if ( m3 == 0 ) return(0); - - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while ( j < n) - { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + 
BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * 
temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } - - - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } - + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( 
!defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - if ( m3 == 1 ) - { + if (m3 == 1) { - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = 
a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - return(0); + return (0); } diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 75027a06c..aa7f16605 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepg %%v0,%3 \n\t" - "vlrepg %%v1,%4 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb 
%%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 
1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + 
"vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - zrot_kernel_16(n1, x, y, &cosa, &sina); - i=n1; - ix=2*n1; - } - - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + zrot_kernel_16(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + ix += 2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + 
iy += inc_y2; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4d8ee960f..fbcc0c5b9 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,426 +27,396 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepg %%v0,0(%1) \n\t" - "vleg %%v1,8(%1),0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,8(%1),1 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - "vpdi %%v28,%%v20,%%v20,4 \n\t" - "vpdi %%v29,%%v21,%%v21,4 \n\t" - "vpdi %%v30,%%v22,%%v22,4 \n\t" - "vpdi %%v31,%%v23,%%v23,4 \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v0 \n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vleg %%v0,8(%1),0 \n\t" - "wflcdb %%v0,%%v0 \n\t" - "vleg %%v0,8(%1),1 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v16,%%v16,%%v16,4 \n\t" - "vpdi %%v17,%%v17,%%v17,4 \n\t" - "vpdi %%v18,%%v18,%%v18,4 \n\t" - "vpdi %%v19,%%v19,%%v19,4 \n\t" - "vpdi %%v20,%%v20,%%v20,4 \n\t" - "vpdi %%v21,%%v21,%%v21,4 \n\t" - "vpdi %%v22,%%v22,%%v22,4 \n\t" - "vpdi %%v23,%%v23,%%v23,4 \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - 
"vfmdb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vpdi %%v28,%%v20,%%v20,4\n\t" + "vpdi %%v29,%%v21,%%v21,4\n\t" + "vpdi %%v30,%%v22,%%v22,4\n\t" + "vpdi %%v31,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepg %%v0,0(%1) \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static 
void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vleg %%v0,8(%[alpha]),0\n\t" + "wflcdb %%v0,%%v0\n\t" + "vleg %%v0,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v16,%%v16,%%v16,4\n\t" + "vpdi %%v17,%%v17,%%v17,4\n\t" + "vpdi %%v18,%%v18,%%v18,4\n\t" + "vpdi %%v19,%%v19,%%v19,4\n\t" + "vpdi %%v20,%%v20,%%v20,4\n\t" + "vpdi %%v21,%%v21,%%v21,4\n\t" + "vpdi %%v22,%%v22,%%v22,4\n\t" + "vpdi %%v23,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - 
FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) - { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; - x[inc_x3] = t3; - - x += 4 * inc_x; - } +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - - if (inc_x != 1) { - inc_x <<= 1; - - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, + BLASLONG inc_x) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} - while (j < n1) { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; + if (inc_x != 1) { + inc_x <<= 1; - } + if (da_r == 0.0) { - while (j < n) { + BLASLONG n1 = n & -2; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + if (da_i == 0.0) { - } + while (j < n1) { - } else { + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = 
temp1; + i += 2 * inc_x; + j += 2; + } - } + while (j < n) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + } - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + } - while (j < n1) { + } else { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - } + while (j < n1) { - while (j < n) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } - } + while (j < n) { - } else { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - zscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } - } + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; } - return (0); - } + } + } - BLASLONG n1 = n & -8; - if (n1 > 0) { + return (0); + } - alpha[0] = da_r; - alpha[1] = da_i; + BLASLONG n1 = n & -8; + if (n1 > 0) { - if (da_r == 0.0) - if (da_i == 0) - zscal_kernel_8_zero(n1, x); - else - zscal_kernel_8_zero_r(n1, alpha, x); - else - if (da_i == 0) - zscal_kernel_8_zero_i(n1, alpha, x); - else - zscal_kernel_8(n1, alpha, x); + alpha[0] = da_r; + alpha[1] = da_i; - i = n1 << 1; - j = n1; - } + if (da_r == 0.0) + if (da_i == 0) + zscal_kernel_8_zero(n1, x); + else + zscal_kernel_8_zero_r(n1, alpha, x); + else if (da_i == 0) + zscal_kernel_8_zero_i(n1, alpha, x); + else + zscal_kernel_8(n1, alpha, x); + i = n1 << 1; + j = n1; + } - if (da_r == 0.0) { + if (da_r == 0.0) { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } - } + } - } else { + } else { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } - - } + } } - return (0); + } + + return (0); } diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index a16b87cdc..0f38103be 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 
192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zswap_kernel_16(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + zswap_kernel_16(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; + while (i < n) { - ix += inc_x2 ; - iy += 
inc_y2 ; - i++ ; + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; - } + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - - -} + } + return (0); +} From 61526480f906c2d9b4c6a5d2d28be21d0f96ca62 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 5 Feb 2019 07:51:19 +0200 Subject: [PATCH 441/935] [ZARCH] Fix copy constraint --- kernel/zarch/ccopy.c | 2 +- kernel/zarch/dcopy.c | 2 +- kernel/zarch/scopy.c | 2 +- kernel/zarch/zcopy.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index 1b93a812e..d17bddcc8 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -36,7 +36,7 @@ static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index f7cbf54b2..b6a740c43 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -36,7 +36,7 @@ static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index 44d27b062..4e4993737 100644 --- a/kernel/zarch/scopy.c +++ b/kernel/zarch/scopy.c @@ -36,7 +36,7 @@ static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 2f80cedce..50ff18646 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -36,7 +36,7 @@ static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } From f4b82d7bc4c20da29c19b2eece602002bd5fe4af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 13:30:13 +0100 Subject: [PATCH 442/935] Include complex rather than complex.h in C++ contexts to avoid name clashes e.g. with boost headers that use I as a generic placeholder. Fixes #1992 as suggested by aprokop in that issue ticket. 
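For reference, the preprocessor logic that results in lapacke.h is sketched below (a flattened illustration of the post-patch state, not a verbatim copy of the header; the comments are added here for explanation only):

    #ifndef lapack_complex_double
    #ifndef __cplusplus
    #include <complex.h>   /* C path: C99 complex support; also defines the macro I */
    #else
    #include <complex>     /* C++ path: the C++ header defines no I macro */
    #endif
    #define lapack_complex_double double _Complex
    #endif

With this guard, a C++ translation unit that includes a Boost header using I as an ordinary identifier can include lapacke.h afterwards without I being macro-expanded to the imaginary unit.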
---
 lapack-netlib/LAPACKE/include/lapacke.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 6ded78c8b..11740e113 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -86,7 +86,11 @@ lapack_complex_float lapack_make_complex_float( float re, float im );
 
 /* Complex type (double precision) */
 #ifndef lapack_complex_double
+#ifndef __cplusplus
 #include <complex.h>
+#else
+#include <complex>
+#endif
 #define lapack_complex_double double _Complex
 #endif
 

From 11a43e81161e5bd3f90e38a1127b1562406e85cd Mon Sep 17 00:00:00 2001
From: maamountki
Date: Tue, 5 Feb 2019 19:17:08 +0200
Subject: [PATCH 443/935] [ZARCH] Set alignment hint for vl/vst

---
 kernel/zarch/damax.c     |  34 ++---
 kernel/zarch/damax_z13.c |  34 ++---
 kernel/zarch/damin.c     |  34 ++---
 kernel/zarch/damin_z13.c |  34 ++---
 kernel/zarch/dasum.c     |  32 ++---
 kernel/zarch/daxpy.c     |  96 +++++++--------
 kernel/zarch/ddot.c      |  32 ++---
 kernel/zarch/dgemv_n_4.c | 260 +++++++++++++++++++--------------------
 kernel/zarch/dgemv_t_4.c | 260 +++++++++++++++++++--------------------
 kernel/zarch/dmax.c      |  34 ++---
 kernel/zarch/dmax_z13.c  |  34 ++---
 kernel/zarch/dmin.c      |  34 ++---
 kernel/zarch/dmin_z13.c  |  34 ++---
 kernel/zarch/drot.c      | 128 +++++++++----------
 kernel/zarch/dscal.c     |  48 ++++----
 kernel/zarch/dswap.c     | 128 +++++++++----------
 kernel/zarch/idamax.c    |  34 ++---
 kernel/zarch/idamin.c    |  34 ++---
 kernel/zarch/idmax.c     |  34 ++---
 kernel/zarch/idmin.c     |  34 ++---
 kernel/zarch/zasum.c     |  32 ++---
 kernel/zarch/zaxpy.c     |  48 ++++----
 kernel/zarch/zdot.c      |  32 ++---
 kernel/zarch/zgemv_n_4.c |  62 +++++-----
 kernel/zarch/zgemv_t_4.c |  40 +++---
 kernel/zarch/zrot.c      | 128 +++++++++----------
 kernel/zarch/zscal.c     | 112 ++++++++---------
 kernel/zarch/zswap.c     | 128 +++++++++----------
 28 files changed, 987 insertions(+), 987 deletions(-)

diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c
index 37008f702..2598145c3 100644
--- a/kernel/zarch/damax.c
+++ b/kernel/zarch/damax.c
@@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmaxdb %%v16,%%v16,%%v24,8\n\t" "vfmaxdb %%v17,%%v17,%%v25,8\n\t" "vfmaxdb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 530d6e5bb..f7e11c3ce 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index a01791741..25f018c66 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmindb %%v16,%%v16,%%v24,8\n\t" "vfmindb %%v17,%%v17,%%v25,8\n\t" "vfmindb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 2172b6d6f..091aceb37 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 9f69a9931..641949963 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -45,14 +45,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl 
%%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[x]),3\n\t" + "vl %%v17, 144(%%r1,%[x]),3\n\t" + "vl %%v18, 160(%%r1,%[x]),3\n\t" + "vl %%v19, 176(%%r1,%[x]),3\n\t" + "vl %%v20, 192(%%r1,%[x]),3\n\t" + "vl %%v21, 208(%%r1,%[x]),3\n\t" + "vl %%v22, 224(%%r1,%[x]),3\n\t" + "vl %%v23, 240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 179ef8834..c02ad0aac 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -34,22 +34,22 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,0(%%r1,%[y]),3\n\t" + "vl %%v21,16(%%r1,%[y]),3\n\t" + "vl %%v22,32(%%r1,%[y]),3\n\t" + "vl %%v23,48(%%r1,%[y]),3\n\t" + "vl %%v24,64(%%r1,%[x]),3\n\t" + "vl %%v25,80(%%r1,%[x]),3\n\t" + "vl %%v26,96(%%r1,%[x]),3\n\t" + "vl %%v27,112(%%r1,%[x]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -58,30 +58,30 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl 
%%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" + "vst %%v16,0(%%r1,%[y]),3\n\t" + "vst %%v17,16(%%r1,%[y]),3\n\t" + "vst %%v18,32(%%r1,%[y]),3\n\t" + "vst %%v19,48(%%r1,%[y]),3\n\t" + "vst %%v24,64(%%r1,%[y]),3\n\t" + "vst %%v25,80(%%r1,%[y]),3\n\t" + "vst %%v26,96(%%r1,%[y]),3\n\t" + "vst %%v27,112(%%r1,%[y]),3\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,128(%%r1,%[y]),3\n\t" + "vl %%v21,144(%%r1,%[y]),3\n\t" + "vl %%v22,160(%%r1,%[y]),3\n\t" + "vl %%v23,176(%%r1,%[y]),3\n\t" + "vl %%v24,192(%%r1,%[x]),3\n\t" + "vl %%v25,208(%%r1,%[x]),3\n\t" + "vl %%v26,224(%%r1,%[x]),3\n\t" + "vl %%v27,240(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[y]),3\n\t" + "vl %%v29,208(%%r1,%[y]),3\n\t" + "vl %%v30,224(%%r1,%[y]),3\n\t" + "vl %%v31,240(%%r1,%[y]),3\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -90,14 +90,14 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" + "vst %%v16,128(%%r1,%[y]),3\n\t" + "vst %%v17,144(%%r1,%[y]),3\n\t" + "vst %%v18,160(%%r1,%[y]),3\n\t" + "vst %%v19,176(%%r1,%[y]),3\n\t" + "vst %%v24,192(%%r1,%[y]),3\n\t" + "vst %%v25,208(%%r1,%[y]),3\n\t" + "vst %%v26,224(%%r1,%[y]),3\n\t" + "vst %%v27,240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f5f601717..0dd8ed08a 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -43,22 +43,22 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[x])\n\t" "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[y]),3\n\t" + "vl %%v25,16(%%r1,%[y]),3\n\t" + "vl %%v26,32(%%r1,%[y]),3\n\t" + "vl %%v27,48(%%r1,%[y]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index c93ff9b54..87ed6ecd1 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -52,26 +52,26 @@ static void 
dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,0(%%r1,%[ap2]),3\n\t" + "vl %%v19,0(%%r1,%[ap3]),3\n\t" + "vl %%v20,16(%%r1,%[ap0]),3\n\t" + "vl %%v21,16(%%r1,%[ap1]),3\n\t" + "vl %%v22,16(%%r1,%[ap2]),3\n\t" + "vl %%v23,16(%%r1,%[ap3]),3\n\t" + "vl %%v24,32(%%r1,%[ap0]),3\n\t" + "vl %%v25,32(%%r1,%[ap1]),3\n\t" + "vl %%v26,32(%%r1,%[ap2]),3\n\t" + "vl %%v27,32(%%r1,%[ap3]),3\n\t" + "vl %%v28,48(%%r1,%[ap0]),3\n\t" + "vl %%v29,48(%%r1,%[ap1]),3\n\t" + "vl %%v30,48(%%r1,%[ap2]),3\n\t" + "vl %%v31,48(%%r1,%[ap3]),3\n\t" + "vl %%v4,0(%%r1,%[y]),3\n\t" + "vl %%v5,16(%%r1,%[y]),3\n\t" + "vl %%v6,32(%%r1,%[y]),3\n\t" + "vl %%v7,48(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -88,30 +88,30 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" + "vst %%v4,0(%%r1,%[y]),3\n\t" + "vst %%v5,16(%%r1,%[y]),3\n\t" + "vst %%v6,32(%%r1,%[y]),3\n\t" + "vst %%v7,48(%%r1,%[y]),3\n\t" + "vl %%v16,64(%%r1,%[ap0]),3\n\t" + "vl %%v17,64(%%r1,%[ap1]),3\n\t" + "vl %%v18,64(%%r1,%[ap2]),3\n\t" + "vl %%v19,64(%%r1,%[ap3]),3\n\t" + "vl %%v20,80(%%r1,%[ap0]),3\n\t" + "vl %%v21,80(%%r1,%[ap1]),3\n\t" + "vl %%v22,80(%%r1,%[ap2]),3\n\t" + "vl %%v23,80(%%r1,%[ap3]),3\n\t" + "vl %%v24,96(%%r1,%[ap0]),3\n\t" + "vl %%v25,96(%%r1,%[ap1]),3\n\t" + "vl %%v26,96(%%r1,%[ap2]),3\n\t" + "vl %%v27,96(%%r1,%[ap3]),3\n\t" + "vl %%v28,112(%%r1,%[ap0]),3\n\t" + "vl %%v29,112(%%r1,%[ap1]),3\n\t" + "vl %%v30,112(%%r1,%[ap2]),3\n\t" + "vl %%v31,112(%%r1,%[ap3]),3\n\t" + "vl %%v4,64(%%r1,%[y]),3\n\t" + "vl %%v5,80(%%r1,%[y]),3\n\t" + "vl %%v6,96(%%r1,%[y]),3\n\t" + "vl %%v7,112(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -128,10 +128,10 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT 
*x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" + "vst %%v4,64(%%r1,%[y]),3\n\t" + "vst %%v5,80(%%r1,%[y]),3\n\t" + "vst %%v6,96(%%r1,%[y]),3\n\t" + "vst %%v7,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -141,16 +141,16 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,0(%%r1,%[ap2]),3\n\t" + "vl %%v19,0(%%r1,%[ap3]),3\n\t" + "vl %%v20,16(%%r1,%[ap0]),3\n\t" + "vl %%v21,16(%%r1,%[ap1]),3\n\t" + "vl %%v22,16(%%r1,%[ap2]),3\n\t" + "vl %%v23,16(%%r1,%[ap3]),3\n\t" + "vl %%v4,0(%%r1,%[y]),3\n\t" + "vl %%v5,16(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" @@ -159,8 +159,8 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v4,0(%%r1,%[y]),3\n\t" + "vst %%v5,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -193,30 +193,30 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,16(%%r1,%[ap0]),3\n\t" + "vl %%v19,16(%%r1,%[ap1]),3\n\t" + "vl %%v20,32(%%r1,%[ap0]),3\n\t" + "vl %%v21,32(%%r1,%[ap1]),3\n\t" + "vl %%v22,48(%%r1,%[ap0]),3\n\t" + "vl %%v23,48(%%r1,%[ap1]),3\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v26,80(%%r1,%[ap0]),3\n\t" + "vl %%v27,80(%%r1,%[ap1]),3\n\t" + "vl %%v28,96(%%r1,%[ap0]),3\n\t" + "vl %%v29,96(%%r1,%[ap1]),3\n\t" + "vl %%v30,112(%%r1,%[ap0]),3\n\t" + "vl %%v31,112(%%r1,%[ap1]),3\n\t" + "vl %%v2,0(%%r1,%[y]),3\n\t" + "vl %%v3,16(%%r1,%[y]),3\n\t" + "vl %%v4,32(%%r1,%[y]),3\n\t" + "vl %%v5,48(%%r1,%[y]),3\n\t" + "vl %%v6,64(%%r1,%[y]),3\n\t" + "vl %%v7,80(%%r1,%[y]),3\n\t" + "vl %%v8,96(%%r1,%[y]),3\n\t" + "vl %%v9,112(%%r1,%[y]),3\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb 
%%v4,%%v20,%%v0,%%v4\n\t" @@ -233,14 +233,14 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" + "vst %%v2,0(%%r1,%[y]),3\n\t" + "vst %%v3,16(%%r1,%[y]),3\n\t" + "vst %%v4,32(%%r1,%[y]),3\n\t" + "vst %%v5,48(%%r1,%[y]),3\n\t" + "vst %%v6,64(%%r1,%[y]),3\n\t" + "vst %%v7,80(%%r1,%[y]),3\n\t" + "vst %%v8,96(%%r1,%[y]),3\n\t" + "vst %%v9,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -250,18 +250,18 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,16(%%r1,%[ap0]),3\n\t" + "vl %%v19,16(%%r1,%[ap1]),3\n\t" + "vl %%v2,0(%%r1,%[y]),3\n\t" + "vl %%v3,16(%%r1,%[y]),3\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v2,0(%%r1,%[y]),3\n\t" + "vst %%v3,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -289,22 +289,22 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0]),3\n\t" + "vl %%v17,16(%%r1,%[a0]),3\n\t" + "vl %%v18,32(%%r1,%[a0]),3\n\t" + "vl %%v19,48(%%r1,%[a0]),3\n\t" + "vl %%v20,64(%%r1,%[a0]),3\n\t" + "vl %%v21,80(%%r1,%[a0]),3\n\t" + "vl %%v22,96(%%r1,%[a0]),3\n\t" + "vl %%v23,112(%%r1,%[a0]),3\n\t" + "vl %%v24,0(%%r1,%[y]),3\n\t" + "vl %%v25,16(%%r1,%[y]),3\n\t" + "vl %%v26,32(%%r1,%[y]),3\n\t" + "vl %%v27,48(%%r1,%[y]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" @@ -313,14 +313,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" + "vst %%v24,0(%%r1,%[y]),3\n\t" + "vst %%v25,16(%%r1,%[y]),3\n\t" + "vst %%v26,32(%%r1,%[y]),3\n\t" + "vst %%v27,48(%%r1,%[y]),3\n\t" + "vst 
%%v28,64(%%r1,%[y]),3\n\t" + "vst %%v29,80(%%r1,%[y]),3\n\t" + "vst %%v30,96(%%r1,%[y]),3\n\t" + "vst %%v31,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -330,14 +330,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,0(%%r1,%[y])\n\t" - "vl %%v19,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0]),3\n\t" + "vl %%v17,16(%%r1,%[a0]),3\n\t" + "vl %%v18,0(%%r1,%[y]),3\n\t" + "vl %%v19,16(%%r1,%[y]),3\n\t" "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y])\n\t" - "vst %%v19,16(%%r1,%[y])\n\t" + "vst %%v18,0(%%r1,%[y]),3\n\t" + "vst %%v19,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 24680cf1b..9fd3c09d6 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -50,77 +50,77 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" + "vl %%v26,0(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" + "vl %%v27,0(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" + "vl %%v28,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" + "vl %%v29,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" + "vl %%v30,16(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" + "vl %%v31,16(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v24,32(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v25,32(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v26,32(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v27,32(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v28,48(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v29,48(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v30,48(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v31,48(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl 
%%v26,64(%%r1,%[ap2])\n\t" + "vl %%v26,64(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" + "vl %%v27,64(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" + "vl %%v28,80(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" + "vl %%v29,80(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" + "vl %%v30,80(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" + "vl %%v31,80(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v24,96(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v25,96(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v26,96(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v27,96(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v28,112(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v29,112(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v30,112(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v31,112(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -131,23 +131,23 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" + "vl %%v26,0(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" + "vl %%v27,0(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" + "vl %%v28,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" + "vl %%v29,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" + "vl %%v30,16(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" + "vl %%v31,16(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -198,45 +198,45 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" + "vl 
%%v26,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" + "vl %%v27,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" + "vl %%v28,32(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" + "vl %%v29,32(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" + "vl %%v30,48(%%r1,%[ap0]),3\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" + "vl %%v31,48(%%r1,%[ap1]),3\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v26,80(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v27,80(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v28,96(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v29,96(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v30,112(%%r1,%[ap0]),3\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v31,112(%%r1,%[ap1]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -247,15 +247,15 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" + "vl %%v26,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" + "vl %%v27,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -299,29 +299,29 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[a0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" + "vl %%v25,16(%%r1,%[a0]),3\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" + "vl %%v26,32(%%r1,%[a0]),3\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" + "vl %%v27,48(%%r1,%[a0]),3\n\t" "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" + "vl %%v28,64(%%r1,%[a0]),3\n\t" "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" + "vl %%v29,80(%%r1,%[a0]),3\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" + "vl %%v30,96(%%r1,%[a0]),3\n\t" "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl 
%%v31,112(%%r1,%[a0])\n\t" + "vl %%v31,112(%%r1,%[a0]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -332,11 +332,11 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[a0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" + "vl %%v25,16(%%r1,%[a0]),3\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -378,38 +378,38 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl %%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v18,32(%%r1,%[src]),3\n\t" + "vl %%v19,48(%%r1,%[src]),3\n\t" + "vl %%v20,64(%%r1,%[src]),3\n\t" + "vl %%v21,80(%%r1,%[src]),3\n\t" + "vl %%v22,96(%%r1,%[src]),3\n\t" + "vl %%v23,112(%%r1,%[src]),3\n\t" + "vl %%v24, 0(%%r1,%[dest]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v25, 16(%%r1,%[dest]),3\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" + "vst %%v25, 16(%%r1,%[dest]),3\n\t" + "vl %%v26, 32(%%r1,%[dest]),3\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" + "vst %%v26, 32(%%r1,%[dest]),3\n\t" + "vl %%v27, 48(%%r1,%[dest]),3\n\t" "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" + "vst %%v27, 48(%%r1,%[dest]),3\n\t" + "vl %%v28, 64(%%r1,%[dest]),3\n\t" "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" + "vst %%v28, 64(%%r1,%[dest]),3\n\t" + "vl %%v29, 80(%%r1,%[dest]),3\n\t" "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" + "vst %%v29, 80(%%r1,%[dest]),3\n\t" + "vl %%v30, 96(%%r1,%[dest]),3\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" + "vst %%v30, 96(%%r1,%[dest]),3\n\t" + "vl %%v31, 112(%%r1,%[dest]),3\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" + "vst %%v31, 112(%%r1,%[dest]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -419,14 +419,14 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v24, 0(%%r1,%[dest]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v25, 16(%%r1,%[dest]),3\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v25, 16(%%r1,%[dest]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git 
a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 65ed31f01..cc0f23c87 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmaxdb %%v16,%%v16,%%v24,0\n\t" "vfmaxdb %%v17,%%v17,%%v25,0\n\t" "vfmaxdb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 87bccbe55..83d827d35 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" @@ -59,14 +59,14 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 518cc262c..754828b7c 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmindb %%v16,%%v16,%%v24,0\n\t" "vfmindb %%v17,%%v17,%%v25,0\n\t" "vfmindb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index 91561992f..ff0fca48c 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" @@ -59,14 +59,14 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 8f0197f02..de2207fcd 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -35,14 +35,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x]),3\n\t" + "vl %%v25, 16(%%r1,%[x]),3\n\t" + "vl %%v26, 32(%%r1,%[x]),3\n\t" + "vl %%v27, 48(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[y]),3\n\t" + "vl %%v17, 16(%%r1,%[y]),3\n\t" + "vl %%v18, 32(%%r1,%[y]),3\n\t" + "vl %%v19, 48(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" + "vst %%v28, 0(%%r1,%[x]),3\n\t" + "vst %%v29, 16(%%r1,%[x]),3\n\t" + "vst %%v30, 32(%%r1,%[x]),3\n\t" + "vst %%v31, 48(%%r1,%[x]),3\n\t" + "vst %%v20, 0(%%r1,%[y]),3\n\t" + "vst %%v21, 16(%%r1,%[y]),3\n\t" + "vst %%v22, 32(%%r1,%[y]),3\n\t" + "vst 
%%v23, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 64(%%r1,%[x]),3\n\t" + "vl %%v25, 80(%%r1,%[x]),3\n\t" + "vl %%v26, 96(%%r1,%[x]),3\n\t" + "vl %%v27, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 64(%%r1,%[y]),3\n\t" + "vl %%v17, 80(%%r1,%[y]),3\n\t" + "vl %%v18, 96(%%r1,%[y]),3\n\t" + "vl %%v19, 112(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" + "vst %%v28, 64(%%r1,%[x]),3\n\t" + "vst %%v29, 80(%%r1,%[x]),3\n\t" + "vst %%v30, 96(%%r1,%[x]),3\n\t" + "vst %%v31, 112(%%r1,%[x]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[y]),3\n\t" + "vl %%v17, 144(%%r1,%[y]),3\n\t" + "vl %%v18, 160(%%r1,%[y]),3\n\t" + "vl %%v19, 176(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" + "vst %%v28, 128(%%r1,%[x]),3\n\t" + "vst %%v29, 144(%%r1,%[x]),3\n\t" + "vst %%v30, 160(%%r1,%[x]),3\n\t" + "vst %%v31, 176(%%r1,%[x]),3\n\t" + "vst %%v20, 128(%%r1,%[y]),3\n\t" + "vst %%v21, 144(%%r1,%[y]),3\n\t" + "vst %%v22, 160(%%r1,%[y]),3\n\t" + "vst %%v23, 176(%%r1,%[y]),3\n\t" + "vl %%v24, 192(%%r1,%[x]),3\n\t" + "vl %%v25, 208(%%r1,%[x]),3\n\t" + "vl %%v26, 224(%%r1,%[x]),3\n\t" + "vl %%v27, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 192(%%r1,%[y]),3\n\t" + "vl %%v17, 208(%%r1,%[y]),3\n\t" + "vl %%v18, 224(%%r1,%[y]),3\n\t" + "vl %%v19, 240(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 
192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[x]),3\n\t" + "vst %%v29, 208(%%r1,%[x]),3\n\t" + "vst %%v30, 224(%%r1,%[x]),3\n\t" + "vst %%v31, 240(%%r1,%[x]),3\n\t" + "vst %%v20, 192(%%r1,%[y]),3\n\t" + "vst %%v21, 208(%%r1,%[y]),3\n\t" + "vst %%v22, 224(%%r1,%[y]),3\n\t" + "vst %%v23, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index c944990b5..bc58569d5 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -33,30 +33,30 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x]),3\n\t" "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" + "vst %%v24,0(%%r1,%[x]),3\n\t" + "vl %%v25,16(%%r1,%[x]),3\n\t" "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" + "vst %%v25,16(%%r1,%[x]),3\n\t" + "vl %%v26,32(%%r1,%[x]),3\n\t" "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" + "vst %%v26,32(%%r1,%[x]),3\n\t" + "vl %%v27,48(%%r1,%[x]),3\n\t" "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" + "vst %%v27,48(%%r1,%[x]),3\n\t" + "vl %%v28,64(%%r1,%[x]),3\n\t" "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" + "vst %%v28,64(%%r1,%[x]),3\n\t" + "vl %%v29,80(%%r1,%[x]),3\n\t" "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" + "vst %%v29,80(%%r1,%[x]),3\n\t" + "vl %%v30,96(%%r1,%[x]),3\n\t" "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" + "vst %%v30,96(%%r1,%[x]),3\n\t" + "vl %%v31,112(%%r1,%[x]),3\n\t" "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" + "vst %%v31,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) @@ -71,14 +71,14 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x]),3\n\t" + "vst %%v0,16(%%r1,%[x]),3\n\t" + "vst %%v0,32(%%r1,%[x]),3\n\t" + "vst %%v0,48(%%r1,%[x]),3\n\t" + "vst %%v0,64(%%r1,%[x]),3\n\t" + "vst %%v0,80(%%r1,%[x]),3\n\t" + "vst %%v0,96(%%r1,%[x]),3\n\t" + "vst %%v0,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 60ba40bd6..f4da46dc1 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -33,70 +33,70 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - 
"vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v28, 192(%%r1,%[x]),3\n\t" + "vl %%v29, 208(%%r1,%[x]),3\n\t" + "vl %%v30, 224(%%r1,%[x]),3\n\t" + "vl %%v31, 240(%%r1,%[x]),3\n\t" + "vl %%v0, 0(%%r1,%[y]),3\n\t" + "vl %%v1, 16(%%r1,%[y]),3\n\t" + "vl %%v2, 32(%%r1,%[y]),3\n\t" + "vl %%v3, 48(%%r1,%[y]),3\n\t" + "vl %%v4, 64(%%r1,%[y]),3\n\t" + "vl %%v5, 80(%%r1,%[y]),3\n\t" + "vl %%v6, 96(%%r1,%[y]),3\n\t" + "vl %%v7, 112(%%r1,%[y]),3\n\t" + "vst %%v0, 0(%%r1,%[x]),3\n\t" + "vst %%v1, 16(%%r1,%[x]),3\n\t" + "vst %%v2, 32(%%r1,%[x]),3\n\t" + "vst %%v3, 48(%%r1,%[x]),3\n\t" + "vst %%v4, 64(%%r1,%[x]),3\n\t" + "vst %%v5, 80(%%r1,%[x]),3\n\t" + "vst %%v6, 96(%%r1,%[x]),3\n\t" + "vst %%v7, 112(%%r1,%[x]),3\n\t" + "vl %%v0, 128(%%r1,%[y]),3\n\t" + "vl %%v1, 144(%%r1,%[y]),3\n\t" + "vl %%v2, 160(%%r1,%[y]),3\n\t" + "vl %%v3, 176(%%r1,%[y]),3\n\t" + "vl %%v4, 192(%%r1,%[y]),3\n\t" + "vl %%v5, 208(%%r1,%[y]),3\n\t" + "vl %%v6, 224(%%r1,%[y]),3\n\t" + "vl %%v7, 240(%%r1,%[y]),3\n\t" + "vst %%v0, 128(%%r1,%[x]),3\n\t" + "vst %%v1, 144(%%r1,%[x]),3\n\t" + "vst %%v2, 160(%%r1,%[x]),3\n\t" + "vst %%v3, 176(%%r1,%[x]),3\n\t" + "vst %%v4, 192(%%r1,%[x]),3\n\t" + "vst %%v5, 208(%%r1,%[x]),3\n\t" + "vst %%v6, 224(%%r1,%[x]),3\n\t" + "vst %%v7, 240(%%r1,%[x]),3\n\t" + "vst %%v16, 0(%%r1,%[y]),3\n\t" + "vst %%v17, 16(%%r1,%[y]),3\n\t" + "vst %%v18, 
32(%%r1,%[y]),3\n\t" + "vst %%v19, 48(%%r1,%[y]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vst %%v24, 128(%%r1,%[y]),3\n\t" + "vst %%v25, 144(%%r1,%[y]),3\n\t" + "vst %%v26, 160(%%r1,%[y]),3\n\t" + "vst %%v27, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[y]),3\n\t" + "vst %%v29, 208(%%r1,%[y]),3\n\t" + "vst %%v30, 224(%%r1,%[y]),3\n\t" + "vst %%v31, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 8434c811f..bd0f18115 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 80a37e6c2..4884d1e3a 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 18cdba437..a6b95bf3e 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 02ca427e4..c3f36d964 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 43ae8ff8b..83e5e93c9 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -45,14 +45,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[x]),3\n\t" + "vl %%v17, 144(%%r1,%[x]),3\n\t" + "vl %%v18, 160(%%r1,%[x]),3\n\t" + "vl %%v19, 176(%%r1,%[x]),3\n\t" + "vl %%v20, 192(%%r1,%[x]),3\n\t" + "vl %%v21, 208(%%r1,%[x]),3\n\t" + "vl %%v22, 224(%%r1,%[x]),3\n\t" + "vl %%v23, 240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 31549849d..77bb09a2e 100644 --- a/kernel/zarch/zaxpy.c +++ 
b/kernel/zarch/zaxpy.c @@ -45,22 +45,22 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x]),3\n\t" + "vl %%v9,16(%%r1,%[x]),3\n\t" + "vl %%v10,32(%%r1,%[x]),3\n\t" + "vl %%v11,48(%%r1,%[x]),3\n\t" + "vl %%v12,0(%%r1,%[y]),3\n\t" + "vl %%v13,16(%%r1,%[y]),3\n\t" + "vl %%v14,32(%%r1,%[y]),3\n\t" + "vl %%v15,48(%%r1,%[y]),3\n\t" + "vl %%v16,64(%%r1,%[x]),3\n\t" + "vl %%v17,80(%%r1,%[x]),3\n\t" + "vl %%v18,96(%%r1,%[x]),3\n\t" + "vl %%v19,112(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[y]),3\n\t" + "vl %%v21,80(%%r1,%[y]),3\n\t" + "vl %%v22,96(%%r1,%[y]),3\n\t" + "vl %%v23,112(%%r1,%[y]),3\n\t" "vpdi %%v24,%%v8,%%v8,4\n\t" "vpdi %%v25,%%v9,%%v9,4\n\t" "vpdi %%v26,%%v10,%%v10,4\n\t" @@ -85,14 +85,14 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" + "vst %%v8,0(%%r1,%[y]),3\n\t" + "vst %%v9,16(%%r1,%[y]),3\n\t" + "vst %%v10,32(%%r1,%[y]),3\n\t" + "vst %%v11,48(%%r1,%[y]),3\n\t" + "vst %%v16,64(%%r1,%[y]),3\n\t" + "vst %%v17,80(%%r1,%[y]),3\n\t" + "vst %%v18,96(%%r1,%[y]),3\n\t" + "vst %%v19,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 7a67ef734..8cfbaadb8 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -41,14 +41,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v0, 0(%%r1,%[y]),3\n\t" + "vl %%v1, 16(%%r1,%[y]),3\n\t" + "vl %%v2, 32(%%r1,%[y]),3\n\t" + "vl %%v3, 48(%%r1,%[y]),3\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" @@ -61,14 +61,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" + "vl %%v16, 64(%%r1,%[x]),3\n\t" + "vl %%v17, 80(%%r1,%[x]),3\n\t" + "vl %%v18, 
96(%%r1,%[x]),3\n\t" + "vl %%v19, 112(%%r1,%[x]),3\n\t" + "vl %%v0, 64(%%r1,%[y]),3\n\t" + "vl %%v1, 80(%%r1,%[y]),3\n\t" + "vl %%v2, 96(%%r1,%[y]),3\n\t" + "vl %%v3, 112(%%r1,%[y]),3\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 7f21985ec..4b64fc8a5 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" - "vl %%v18,32(%[x])\n\t" - "vl %%v19,48(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" + "vl %%v17,16(%[x]),3\n\t" + "vl %%v18,32(%[x]),3\n\t" + "vl %%v19,48(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v20,8(%[x]),0\n\t" "wflcdb %%v20,%%v20\n\t" @@ -69,8 +69,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v24,0(%%r1,%[ap0])\n\t" "vlrepg %%v25,8(%%r1,%[ap0])\n\t" "vlrepg %%v26,0(%%r1,%[ap1])\n\t" @@ -103,8 +103,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -119,8 +119,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" + "vl %%v17,16(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v18,8(%[x]),0\n\t" "wflcdb %%v18,%%v18\n\t" @@ -142,8 +142,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v20,0(%%r1,%[ap0])\n\t" "vlrepg %%v21,8(%%r1,%[ap0])\n\t" "vlrepg %%v22,0(%%r1,%[ap1])\n\t" @@ -160,8 +160,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -173,7 +173,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v17,8(%[x]),0\n\t" "wflcdb %%v17,%%v17\n\t" @@ -188,8 +188,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 
2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v18,0(%%r1,%[ap])\n\t" "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vlrepg %%v20,16(%%r1,%[ap])\n\t" @@ -198,8 +198,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -227,14 +227,14 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,0(%%r1,%[dest])\n\t" - "vl %%v21,16(%%r1,%[dest])\n\t" - "vl %%v22,32(%%r1,%[dest])\n\t" - "vl %%v23,48(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v18,32(%%r1,%[src]),3\n\t" + "vl %%v19,48(%%r1,%[src]),3\n\t" + "vl %%v20,0(%%r1,%[dest]),3\n\t" + "vl %%v21,16(%%r1,%[dest]),3\n\t" + "vl %%v22,32(%%r1,%[dest]),3\n\t" + "vl %%v23,48(%%r1,%[dest]),3\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -247,10 +247,10 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest])\n\t" - "vst %%v29,16(%%r1,%[dest])\n\t" - "vst %%v30,32(%%r1,%[dest])\n\t" - "vst %%v31,48(%%r1,%[dest])\n\t" + "vst %%v28,0(%%r1,%[dest]),3\n\t" + "vst %%v29,16(%%r1,%[dest]),3\n\t" + "vst %%v30,32(%%r1,%[dest]),3\n\t" + "vst %%v31,48(%%r1,%[dest]),3\n\t" "agfi %%r1,64\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 7b3e6c1fc..429824dcf 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -73,7 +73,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -120,10 +120,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v24,0(%[alpha]),0\n\t" "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y])\n\t" - "vl %%v27,16(%[y])\n\t" - "vl %%v28,32(%[y])\n\t" - "vl %%v29,48(%[y])\n\t" + "vl %%v26,0(%[y]),3\n\t" + "vl %%v27,16(%[y]),3\n\t" + "vl %%v28,32(%[y]),3\n\t" + "vl %%v29,48(%[y]),3\n\t" "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" @@ -132,10 +132,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb 
%%v28,%%v22,%%v25,%%v28\n\t" "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y])\n\t" - "vst %%v27,16(%[y])\n\t" - "vst %%v28,32(%[y])\n\t" - "vst %%v29,48(%[y])" + "vst %%v26,0(%[y]),3\n\t" + "vst %%v27,16(%[y]),3\n\t" + "vst %%v28,32(%[y]),3\n\t" + "vst %%v29,48(%[y]),3" : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -160,7 +160,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -178,7 +178,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -213,14 +213,14 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v20,0(%[alpha]),0\n\t" "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y])\n\t" - "vl %%v23,16(%[y])\n\t" + "vl %%v22,0(%[y]),3\n\t" + "vl %%v23,16(%[y]),3\n\t" "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y])\n\t" - "vst %%v23,16(%[y])\n\t" + "vst %%v22,0(%[y]),3\n\t" + "vst %%v23,16(%[y]),3\n\t" : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -239,7 +239,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -253,7 +253,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -282,10 +282,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vleg %%v18,0(%[alpha]),0\n\t" "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y])\n\t" + "vl %%v0,0(%[y]),3\n\t" "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y])\n\t" + "vst %%v0,0(%[y]),3\n\t" : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index aa7f16605..ea81e4741 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -35,14 +35,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl 
%%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x]),3\n\t" + "vl %%v25, 16(%%r1,%[x]),3\n\t" + "vl %%v26, 32(%%r1,%[x]),3\n\t" + "vl %%v27, 48(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[y]),3\n\t" + "vl %%v17, 16(%%r1,%[y]),3\n\t" + "vl %%v18, 32(%%r1,%[y]),3\n\t" + "vl %%v19, 48(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" + "vst %%v28, 0(%%r1,%[x]),3\n\t" + "vst %%v29, 16(%%r1,%[x]),3\n\t" + "vst %%v30, 32(%%r1,%[x]),3\n\t" + "vst %%v31, 48(%%r1,%[x]),3\n\t" + "vst %%v20, 0(%%r1,%[y]),3\n\t" + "vst %%v21, 16(%%r1,%[y]),3\n\t" + "vst %%v22, 32(%%r1,%[y]),3\n\t" + "vst %%v23, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 64(%%r1,%[x]),3\n\t" + "vl %%v25, 80(%%r1,%[x]),3\n\t" + "vl %%v26, 96(%%r1,%[x]),3\n\t" + "vl %%v27, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 64(%%r1,%[y]),3\n\t" + "vl %%v17, 80(%%r1,%[y]),3\n\t" + "vl %%v18, 96(%%r1,%[y]),3\n\t" + "vl %%v19, 112(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" + "vst %%v28, 64(%%r1,%[x]),3\n\t" + "vst %%v29, 80(%%r1,%[x]),3\n\t" + "vst %%v30, 96(%%r1,%[x]),3\n\t" + "vst %%v31, 112(%%r1,%[x]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[y]),3\n\t" + "vl %%v17, 144(%%r1,%[y]),3\n\t" + "vl %%v18, 160(%%r1,%[y]),3\n\t" + "vl %%v19, 176(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn 
*/ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" + "vst %%v28, 128(%%r1,%[x]),3\n\t" + "vst %%v29, 144(%%r1,%[x]),3\n\t" + "vst %%v30, 160(%%r1,%[x]),3\n\t" + "vst %%v31, 176(%%r1,%[x]),3\n\t" + "vst %%v20, 128(%%r1,%[y]),3\n\t" + "vst %%v21, 144(%%r1,%[y]),3\n\t" + "vst %%v22, 160(%%r1,%[y]),3\n\t" + "vst %%v23, 176(%%r1,%[y]),3\n\t" + "vl %%v24, 192(%%r1,%[x]),3\n\t" + "vl %%v25, 208(%%r1,%[x]),3\n\t" + "vl %%v26, 224(%%r1,%[x]),3\n\t" + "vl %%v27, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 192(%%r1,%[y]),3\n\t" + "vl %%v17, 208(%%r1,%[y]),3\n\t" + "vl %%v18, 224(%%r1,%[y]),3\n\t" + "vl %%v19, 240(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[x]),3\n\t" + "vst %%v29, 208(%%r1,%[x]),3\n\t" + "vst %%v30, 224(%%r1,%[x]),3\n\t" + "vst %%v31, 240(%%r1,%[x]),3\n\t" + "vst %%v20, 192(%%r1,%[y]),3\n\t" + "vst %%v21, 208(%%r1,%[y]),3\n\t" + "vst %%v22, 224(%%r1,%[y]),3\n\t" + "vst %%v23, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index fbcc0c5b9..7fd62a1ac 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -36,14 +36,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -68,14 +68,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + "vst 
%%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -93,14 +93,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vpdi %%v16,%%v16,%%v16,4\n\t" "vpdi %%v17,%%v17,%%v17,4\n\t" "vpdi %%v18,%%v18,%%v18,4\n\t" @@ -117,14 +117,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + "vst %%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -139,14 +139,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t" @@ -155,14 +155,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + "vst %%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -177,14 +177,14 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 
1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x]),3\n\t" + "vst %%v0,16(%%r1,%[x]),3\n\t" + "vst %%v0,32(%%r1,%[x]),3\n\t" + "vst %%v0,48(%%r1,%[x]),3\n\t" + "vst %%v0,64(%%r1,%[x]),3\n\t" + "vst %%v0,80(%%r1,%[x]),3\n\t" + "vst %%v0,96(%%r1,%[x]),3\n\t" + "vst %%v0,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0f38103be..0252ab8db 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -33,70 +33,70 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v28, 192(%%r1,%[x]),3\n\t" + "vl %%v29, 208(%%r1,%[x]),3\n\t" + "vl %%v30, 224(%%r1,%[x]),3\n\t" + "vl %%v31, 240(%%r1,%[x]),3\n\t" + "vl %%v0, 
0(%%r1,%[y]),3\n\t"
+        "vl %%v1, 16(%%r1,%[y]),3\n\t"
+        "vl %%v2, 32(%%r1,%[y]),3\n\t"
+        "vl %%v3, 48(%%r1,%[y]),3\n\t"
+        "vl %%v4, 64(%%r1,%[y]),3\n\t"
+        "vl %%v5, 80(%%r1,%[y]),3\n\t"
+        "vl %%v6, 96(%%r1,%[y]),3\n\t"
+        "vl %%v7, 112(%%r1,%[y]),3\n\t"
+        "vst %%v0, 0(%%r1,%[x]),3\n\t"
+        "vst %%v1, 16(%%r1,%[x]),3\n\t"
+        "vst %%v2, 32(%%r1,%[x]),3\n\t"
+        "vst %%v3, 48(%%r1,%[x]),3\n\t"
+        "vst %%v4, 64(%%r1,%[x]),3\n\t"
+        "vst %%v5, 80(%%r1,%[x]),3\n\t"
+        "vst %%v6, 96(%%r1,%[x]),3\n\t"
+        "vst %%v7, 112(%%r1,%[x]),3\n\t"
+        "vl %%v0, 128(%%r1,%[y]),3\n\t"
+        "vl %%v1, 144(%%r1,%[y]),3\n\t"
+        "vl %%v2, 160(%%r1,%[y]),3\n\t"
+        "vl %%v3, 176(%%r1,%[y]),3\n\t"
+        "vl %%v4, 192(%%r1,%[y]),3\n\t"
+        "vl %%v5, 208(%%r1,%[y]),3\n\t"
+        "vl %%v6, 224(%%r1,%[y]),3\n\t"
+        "vl %%v7, 240(%%r1,%[y]),3\n\t"
+        "vst %%v0, 128(%%r1,%[x]),3\n\t"
+        "vst %%v1, 144(%%r1,%[x]),3\n\t"
+        "vst %%v2, 160(%%r1,%[x]),3\n\t"
+        "vst %%v3, 176(%%r1,%[x]),3\n\t"
+        "vst %%v4, 192(%%r1,%[x]),3\n\t"
+        "vst %%v5, 208(%%r1,%[x]),3\n\t"
+        "vst %%v6, 224(%%r1,%[x]),3\n\t"
+        "vst %%v7, 240(%%r1,%[x]),3\n\t"
+        "vst %%v16, 0(%%r1,%[y]),3\n\t"
+        "vst %%v17, 16(%%r1,%[y]),3\n\t"
+        "vst %%v18, 32(%%r1,%[y]),3\n\t"
+        "vst %%v19, 48(%%r1,%[y]),3\n\t"
+        "vst %%v20, 64(%%r1,%[y]),3\n\t"
+        "vst %%v21, 80(%%r1,%[y]),3\n\t"
+        "vst %%v22, 96(%%r1,%[y]),3\n\t"
+        "vst %%v23, 112(%%r1,%[y]),3\n\t"
+        "vst %%v24, 128(%%r1,%[y]),3\n\t"
+        "vst %%v25, 144(%%r1,%[y]),3\n\t"
+        "vst %%v26, 160(%%r1,%[y]),3\n\t"
+        "vst %%v27, 176(%%r1,%[y]),3\n\t"
+        "vst %%v28, 192(%%r1,%[y]),3\n\t"
+        "vst %%v29, 208(%%r1,%[y]),3\n\t"
+        "vst %%v30, 224(%%r1,%[y]),3\n\t"
+        "vst %%v31, 240(%%r1,%[y]),3\n\t"
         "agfi %%r1,256\n\t"
         "brctg %[n],0b"
         : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)

From 1391fc46d2c38bb74ed69b7a527ab8865161c915 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 5 Feb 2019 19:29:33 +0100
Subject: [PATCH 444/935] fix second instance of complex.h for c++ as well

---
 lapack-netlib/LAPACKE/include/lapacke.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 6ded78c8b..c5ea465e0 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -70,7 +70,11 @@

 /* Complex type (single precision) */
 #ifndef lapack_complex_float
+#ifndef __cplusplus
 #include <complex.h>
+#else
+#include <complex>
+#endif
 #define lapack_complex_float float _Complex
 #endif

@@ -86,7 +90,11 @@ lapack_complex_float lapack_make_complex_float( float re, float im );

 /* Complex type (double precision) */
 #ifndef lapack_complex_double
+#ifndef __cplusplus
 #include <complex.h>
+#else
+#include <complex>
+#endif
 #define lapack_complex_double double _Complex
 #endif

From d70ae3ab433bda46708f02bf74c03c861bfb546f Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 5 Feb 2019 20:06:34 +0100
Subject: [PATCH 445/935] Make c_check robust against old or incomplete perl
 installations by catching and working around failures to load modules, and
 avoiding object-oriented syntax in tempfile creation.
Fixes #1989
---
 c_check | 85 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 56 insertions(+), 29 deletions(-)

diff --git a/c_check b/c_check
index 9dc237beb..38f9170ca 100644
--- a/c_check
+++ b/c_check
@@ -1,7 +1,7 @@
 #!/usr/bin/perl

-use File::Basename;
-use File::Temp qw(tempfile);
+#use File::Basename;
+# use File::Temp qw(tempfile);

 # Checking cross compile
 $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
 $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
 $hostarch = "zarch" if ($hostarch eq "s390x");

-$tmpf = new File::Temp( UNLINK => 1 );
+#$tmpf = new File::Temp( UNLINK => 1 );
 $binary = $ENV{"BINARY"};

 $makefile = shift(@ARGV);
@@ -31,12 +31,25 @@ if ($?) {

 $cross_suffix = "";

-if (dirname($compiler_name) ne ".") {
-    $cross_suffix .= dirname($compiler_name) . "/";
-}
+eval "use File::Basename";
+if ($@){
+    warn "could not load PERL module File::Basename, emulating its functionality";
+    my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
+    if ($dirnam ne ".") {
+        $cross_suffix .= $dirnam . "/";
+    }
+    my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
+    if ($basnam =~ /([^\s]*-)(.*)/) {
+        $cross_suffix .= $1;
+    }
+} else {
+    if (dirname($compiler_name) ne ".") {
+        $cross_suffix .= dirname($compiler_name) . "/";
+    }

-if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
-    $cross_suffix .= $1;
+    if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
+        $cross_suffix .= $1;
+    }
 }

 $compiler = "";
@@ -171,20 +184,26 @@ if ($?) {

 $have_msa = 0;
 if (($architecture eq "mips") || ($architecture eq "mips64")) {
-    $code = '"addvi.b $w0, $w1, 1"';
-    $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
-    print $tmpf "#include <msa.h>\n\n";
-    print $tmpf "void main(void){ __asm__ volatile($code); }\n";
-
-    $args = "$msa_flags -o $tmpf.o -x c $tmpf";
-    my @cmd = ("$compiler_name $args");
-    system(@cmd) == 0;
-    if ($? != 0) {
-        $have_msa = 0;
+    eval "use File::Temp qw(tempfile)";
+    if ($@){
+        warn "could not load PERL module File::Temp, so could not check MSA capability";
     } else {
-        $have_msa = 1;
+        $tmpf = new File::Temp( UNLINK => 1 );
+        $code = '"addvi.b $w0, $w1, 1"';
+        $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
+        print $tmpf "#include <msa.h>\n\n";
+        print $tmpf "void main(void){ __asm__ volatile($code); }\n";
+
+        $args = "$msa_flags -o $tmpf.o -x c $tmpf";
+        my @cmd = ("$compiler_name $args");
+        system(@cmd) == 0;
+        if ($? != 0) {
+            $have_msa = 0;
+        } else {
+            $have_msa = 1;
+        }
+        unlink("$tmpf.o");
     }
-    unlink("$tmpf.o");
 }

 $architecture = x86 if ($data =~ /ARCH_X86/);
@@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/);

 $no_avx512= 0;
 if (($architecture eq "x86") || ($architecture eq "x86_64")) {
-    $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
-    print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
-    $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
-    my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
-    system(@cmd) == 0;
-    if ($? != 0) {
-        $no_avx512 = 1;
-    } else {
+    eval "use File::Temp qw(tempfile)";
+    if ($@){
+        warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
         $no_avx512 = 0;
+    } else {
+#        $tmpf = new File::Temp( UNLINK => 1 );
+        ($fh,$tmpf) = tempfile( UNLINK => 1 );
+        $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
+        print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
+        $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
+        my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
+        system(@cmd) == 0;
+        if ($? != 0) {
+            $no_avx512 = 1;
+        } else {
+            $no_avx512 = 0;
+        }
+        unlink("tmpf.o");
     }
-    unlink("tmpf.o");
 }

 $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;

From 5952e586ceaa7ea68376f1580c6c96edca55804b Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 5 Feb 2019 23:51:40 +0100
Subject: [PATCH 446/935] Support DYNAMIC_LIST option in cmake

e.g. cmake -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST="NEHALEM;HASWELL;ZEN" ..
original issue was #1639
---
 cmake/arch.cmake   |  3 +++
 cmake/system.cmake |  7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 63fb86fa2..470ea2a8f 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -74,6 +74,9 @@ if (DYNAMIC_ARCH)
     if (NOT NO_AVX512)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
     endif ()
+    if (DYNAMIC_LIST)
+      set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
+    endif ()
   endif ()

 if (NOT DYNAMIC_CORE)
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 4cee7bd18..7fda2adb9 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -187,6 +187,13 @@ if (DYNAMIC_ARCH)
   endif ()
 endif ()

+if (DYNAMIC_LIST)
+  set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
+  foreach(DCORE ${DYNAMIC_LIST})
+    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
+  endforeach ()
+endif ()
+
 if (NO_LAPACK)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
 #Disable LAPACK C interface

From 70397701652743587a88b20837c3b6e2c1da74f0 Mon Sep 17 00:00:00 2001
From: maamountki
Date: Wed, 6 Feb 2019 20:11:44 +0200
Subject: [PATCH 447/935] [ZARCH] Undo the last commit

---
 kernel/zarch/damax.c     |  34 ++---
 kernel/zarch/damax_z13.c |  34 ++---
 kernel/zarch/damin.c     |  34 ++---
 kernel/zarch/damin_z13.c |  34 ++---
 kernel/zarch/dasum.c     |  32 ++---
 kernel/zarch/daxpy.c     |  96 +++++++--------
 kernel/zarch/ddot.c      |  32 ++---
 kernel/zarch/dgemv_n_4.c | 260 +++++++++++++++++++--------------------
 kernel/zarch/dgemv_t_4.c | 260 +++++++++++++++++++--------------------
 kernel/zarch/dmax.c      |  34 ++---
 kernel/zarch/dmax_z13.c  |  34 ++---
 kernel/zarch/dmin.c      |  34 ++---
 kernel/zarch/dmin_z13.c  |  34 ++---
 kernel/zarch/drot.c      | 128 +++++++++----------
 kernel/zarch/dscal.c     |  48 ++++----
 kernel/zarch/dswap.c     | 128 +++++++++----------
 kernel/zarch/idamax.c    |  34 ++---
 kernel/zarch/idamin.c    |  34 ++---
 kernel/zarch/idmax.c     |  34 ++---
 kernel/zarch/idmin.c     |  34 ++---
 kernel/zarch/zasum.c     |  32 ++---
 kernel/zarch/zaxpy.c     |  48 ++++----
 kernel/zarch/zdot.c      |  32 ++---
 kernel/zarch/zgemv_n_4.c |  62 +++++-----
 kernel/zarch/zgemv_t_4.c |  40 +++---
 kernel/zarch/zrot.c      | 128 +++++++++----------
 kernel/zarch/zscal.c     | 112 ++++++++---------
 kernel/zarch/zswap.c     | 128 +++++++++----------
 28 files changed, 987 insertions(+), 987 deletions(-)

diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c
index 2598145c3..37008f702 100644
--- a/kernel/zarch/damax.c
+++ b/kernel/zarch/damax.c
@@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmaxdb %%v16,%%v16,%%v24,8\n\t" "vfmaxdb %%v17,%%v17,%%v25,8\n\t" "vfmaxdb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index f7e11c3ce..530d6e5bb 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 25f018c66..a01791741 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmindb %%v16,%%v16,%%v24,8\n\t" "vfmindb %%v17,%%v17,%%v25,8\n\t" "vfmindb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 091aceb37..2172b6d6f 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 641949963..9f69a9931 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -45,14 +45,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - 
"vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x]),3\n\t" - "vl %%v17, 144(%%r1,%[x]),3\n\t" - "vl %%v18, 160(%%r1,%[x]),3\n\t" - "vl %%v19, 176(%%r1,%[x]),3\n\t" - "vl %%v20, 192(%%r1,%[x]),3\n\t" - "vl %%v21, 208(%%r1,%[x]),3\n\t" - "vl %%v22, 224(%%r1,%[x]),3\n\t" - "vl %%v23, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index c02ad0aac..179ef8834 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -34,22 +34,22 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,0(%%r1,%[y]),3\n\t" - "vl %%v21,16(%%r1,%[y]),3\n\t" - "vl %%v22,32(%%r1,%[y]),3\n\t" - "vl %%v23,48(%%r1,%[y]),3\n\t" - "vl %%v24,64(%%r1,%[x]),3\n\t" - "vl %%v25,80(%%r1,%[x]),3\n\t" - "vl %%v26,96(%%r1,%[x]),3\n\t" - "vl %%v27,112(%%r1,%[x]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -58,30 +58,30 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y]),3\n\t" - "vst %%v17,16(%%r1,%[y]),3\n\t" - "vst %%v18,32(%%r1,%[y]),3\n\t" - "vst %%v19,48(%%r1,%[y]),3\n\t" - "vst %%v24,64(%%r1,%[y]),3\n\t" - "vst %%v25,80(%%r1,%[y]),3\n\t" - "vst %%v26,96(%%r1,%[y]),3\n\t" - "vst %%v27,112(%%r1,%[y]),3\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,128(%%r1,%[y]),3\n\t" - "vl %%v21,144(%%r1,%[y]),3\n\t" - "vl %%v22,160(%%r1,%[y]),3\n\t" - "vl %%v23,176(%%r1,%[y]),3\n\t" - "vl %%v24,192(%%r1,%[x]),3\n\t" - "vl 
%%v25,208(%%r1,%[x]),3\n\t" - "vl %%v26,224(%%r1,%[x]),3\n\t" - "vl %%v27,240(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[y]),3\n\t" - "vl %%v29,208(%%r1,%[y]),3\n\t" - "vl %%v30,224(%%r1,%[y]),3\n\t" - "vl %%v31,240(%%r1,%[y]),3\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -90,14 +90,14 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y]),3\n\t" - "vst %%v17,144(%%r1,%[y]),3\n\t" - "vst %%v18,160(%%r1,%[y]),3\n\t" - "vst %%v19,176(%%r1,%[y]),3\n\t" - "vst %%v24,192(%%r1,%[y]),3\n\t" - "vst %%v25,208(%%r1,%[y]),3\n\t" - "vst %%v26,224(%%r1,%[y]),3\n\t" - "vst %%v27,240(%%r1,%[y]),3\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index 0dd8ed08a..f5f601717 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -43,22 +43,22 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[x])\n\t" "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[y]),3\n\t" - "vl %%v25,16(%%r1,%[y]),3\n\t" - "vl %%v26,32(%%r1,%[y]),3\n\t" - "vl %%v27,48(%%r1,%[y]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index 87ed6ecd1..c93ff9b54 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -52,26 +52,26 @@ static void 
dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,0(%%r1,%[ap2]),3\n\t" - "vl %%v19,0(%%r1,%[ap3]),3\n\t" - "vl %%v20,16(%%r1,%[ap0]),3\n\t" - "vl %%v21,16(%%r1,%[ap1]),3\n\t" - "vl %%v22,16(%%r1,%[ap2]),3\n\t" - "vl %%v23,16(%%r1,%[ap3]),3\n\t" - "vl %%v24,32(%%r1,%[ap0]),3\n\t" - "vl %%v25,32(%%r1,%[ap1]),3\n\t" - "vl %%v26,32(%%r1,%[ap2]),3\n\t" - "vl %%v27,32(%%r1,%[ap3]),3\n\t" - "vl %%v28,48(%%r1,%[ap0]),3\n\t" - "vl %%v29,48(%%r1,%[ap1]),3\n\t" - "vl %%v30,48(%%r1,%[ap2]),3\n\t" - "vl %%v31,48(%%r1,%[ap3]),3\n\t" - "vl %%v4,0(%%r1,%[y]),3\n\t" - "vl %%v5,16(%%r1,%[y]),3\n\t" - "vl %%v6,32(%%r1,%[y]),3\n\t" - "vl %%v7,48(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -88,30 +88,30 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y]),3\n\t" - "vst %%v5,16(%%r1,%[y]),3\n\t" - "vst %%v6,32(%%r1,%[y]),3\n\t" - "vst %%v7,48(%%r1,%[y]),3\n\t" - "vl %%v16,64(%%r1,%[ap0]),3\n\t" - "vl %%v17,64(%%r1,%[ap1]),3\n\t" - "vl %%v18,64(%%r1,%[ap2]),3\n\t" - "vl %%v19,64(%%r1,%[ap3]),3\n\t" - "vl %%v20,80(%%r1,%[ap0]),3\n\t" - "vl %%v21,80(%%r1,%[ap1]),3\n\t" - "vl %%v22,80(%%r1,%[ap2]),3\n\t" - "vl %%v23,80(%%r1,%[ap3]),3\n\t" - "vl %%v24,96(%%r1,%[ap0]),3\n\t" - "vl %%v25,96(%%r1,%[ap1]),3\n\t" - "vl %%v26,96(%%r1,%[ap2]),3\n\t" - "vl %%v27,96(%%r1,%[ap3]),3\n\t" - "vl %%v28,112(%%r1,%[ap0]),3\n\t" - "vl %%v29,112(%%r1,%[ap1]),3\n\t" - "vl %%v30,112(%%r1,%[ap2]),3\n\t" - "vl %%v31,112(%%r1,%[ap3]),3\n\t" - "vl %%v4,64(%%r1,%[y]),3\n\t" - "vl %%v5,80(%%r1,%[y]),3\n\t" - "vl %%v6,96(%%r1,%[y]),3\n\t" - "vl %%v7,112(%%r1,%[y]),3\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -128,10 +128,10 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT 
*x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y]),3\n\t" - "vst %%v5,80(%%r1,%[y]),3\n\t" - "vst %%v6,96(%%r1,%[y]),3\n\t" - "vst %%v7,112(%%r1,%[y]),3\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -141,16 +141,16 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,0(%%r1,%[ap2]),3\n\t" - "vl %%v19,0(%%r1,%[ap3]),3\n\t" - "vl %%v20,16(%%r1,%[ap0]),3\n\t" - "vl %%v21,16(%%r1,%[ap1]),3\n\t" - "vl %%v22,16(%%r1,%[ap2]),3\n\t" - "vl %%v23,16(%%r1,%[ap3]),3\n\t" - "vl %%v4,0(%%r1,%[y]),3\n\t" - "vl %%v5,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" @@ -159,8 +159,8 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y]),3\n\t" - "vst %%v5,16(%%r1,%[y]),3\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -193,30 +193,30 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,16(%%r1,%[ap0]),3\n\t" - "vl %%v19,16(%%r1,%[ap1]),3\n\t" - "vl %%v20,32(%%r1,%[ap0]),3\n\t" - "vl %%v21,32(%%r1,%[ap1]),3\n\t" - "vl %%v22,48(%%r1,%[ap0]),3\n\t" - "vl %%v23,48(%%r1,%[ap1]),3\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" - "vl %%v26,80(%%r1,%[ap0]),3\n\t" - "vl %%v27,80(%%r1,%[ap1]),3\n\t" - "vl %%v28,96(%%r1,%[ap0]),3\n\t" - "vl %%v29,96(%%r1,%[ap1]),3\n\t" - "vl %%v30,112(%%r1,%[ap0]),3\n\t" - "vl %%v31,112(%%r1,%[ap1]),3\n\t" - "vl %%v2,0(%%r1,%[y]),3\n\t" - "vl %%v3,16(%%r1,%[y]),3\n\t" - "vl %%v4,32(%%r1,%[y]),3\n\t" - "vl %%v5,48(%%r1,%[y]),3\n\t" - "vl %%v6,64(%%r1,%[y]),3\n\t" - "vl %%v7,80(%%r1,%[y]),3\n\t" - "vl %%v8,96(%%r1,%[y]),3\n\t" - "vl %%v9,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb 
%%v4,%%v20,%%v0,%%v4\n\t" @@ -233,14 +233,14 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y]),3\n\t" - "vst %%v3,16(%%r1,%[y]),3\n\t" - "vst %%v4,32(%%r1,%[y]),3\n\t" - "vst %%v5,48(%%r1,%[y]),3\n\t" - "vst %%v6,64(%%r1,%[y]),3\n\t" - "vst %%v7,80(%%r1,%[y]),3\n\t" - "vst %%v8,96(%%r1,%[y]),3\n\t" - "vst %%v9,112(%%r1,%[y]),3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -250,18 +250,18 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,16(%%r1,%[ap0]),3\n\t" - "vl %%v19,16(%%r1,%[ap1]),3\n\t" - "vl %%v2,0(%%r1,%[y]),3\n\t" - "vl %%v3,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y]),3\n\t" - "vst %%v3,16(%%r1,%[y]),3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -289,22 +289,22 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0]),3\n\t" - "vl %%v17,16(%%r1,%[a0]),3\n\t" - "vl %%v18,32(%%r1,%[a0]),3\n\t" - "vl %%v19,48(%%r1,%[a0]),3\n\t" - "vl %%v20,64(%%r1,%[a0]),3\n\t" - "vl %%v21,80(%%r1,%[a0]),3\n\t" - "vl %%v22,96(%%r1,%[a0]),3\n\t" - "vl %%v23,112(%%r1,%[a0]),3\n\t" - "vl %%v24,0(%%r1,%[y]),3\n\t" - "vl %%v25,16(%%r1,%[y]),3\n\t" - "vl %%v26,32(%%r1,%[y]),3\n\t" - "vl %%v27,48(%%r1,%[y]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" @@ -313,14 +313,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y]),3\n\t" - "vst %%v25,16(%%r1,%[y]),3\n\t" - "vst %%v26,32(%%r1,%[y]),3\n\t" - "vst %%v27,48(%%r1,%[y]),3\n\t" - "vst %%v28,64(%%r1,%[y]),3\n\t" - "vst %%v29,80(%%r1,%[y]),3\n\t" - "vst %%v30,96(%%r1,%[y]),3\n\t" - "vst %%v31,112(%%r1,%[y]),3\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst 
%%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -330,14 +330,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[a0]),3\n\t" - "vl %%v17,16(%%r1,%[a0]),3\n\t" - "vl %%v18,0(%%r1,%[y]),3\n\t" - "vl %%v19,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y]),3\n\t" - "vst %%v19,16(%%r1,%[y]),3\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 9fd3c09d6..24680cf1b 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -50,77 +50,77 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2]),3\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3]),3\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0]),3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1]),3\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2]),3\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3]),3\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0]),3\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1]),3\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2]),3\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3]),3\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0]),3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1]),3\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2]),3\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3]),3\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2]),3\n\t" + 
"vl %%v26,64(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3]),3\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0]),3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1]),3\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2]),3\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3]),3\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0]),3\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1]),3\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2]),3\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3]),3\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0]),3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1]),3\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2]),3\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3]),3\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -131,23 +131,23 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2]),3\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3]),3\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0]),3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1]),3\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2]),3\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3]),3\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -198,45 +198,45 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0]),3\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" "vfmadb 
%%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1]),3\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0]),3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1]),3\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0]),3\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1]),3\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0]),3\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1]),3\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0]),3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1]),3\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0]),3\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1]),3\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -247,15 +247,15 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0]),3\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1]),3\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -299,29 +299,29 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[a0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0]),3\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0]),3\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0]),3\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0]),3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0]),3\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0]),3\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0]),3\n\t" + "vl 
%%v31,112(%%r1,%[a0])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -332,11 +332,11 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[a0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0]),3\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -378,38 +378,38 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v18,32(%%r1,%[src]),3\n\t" - "vl %%v19,48(%%r1,%[src]),3\n\t" - "vl %%v20,64(%%r1,%[src]),3\n\t" - "vl %%v21,80(%%r1,%[src]),3\n\t" - "vl %%v22,96(%%r1,%[src]),3\n\t" - "vl %%v23,112(%%r1,%[src]),3\n\t" - "vl %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest]),3\n\t" - "vl %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest]),3\n\t" - "vl %%v26, 32(%%r1,%[dest]),3\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest]),3\n\t" - "vl %%v27, 48(%%r1,%[dest]),3\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest]),3\n\t" - "vl %%v28, 64(%%r1,%[dest]),3\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest]),3\n\t" - "vl %%v29, 80(%%r1,%[dest]),3\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest]),3\n\t" - "vl %%v30, 96(%%r1,%[dest]),3\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest]),3\n\t" - "vl %%v31, 112(%%r1,%[dest]),3\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest]),3\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -419,14 +419,14 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest]),3\n\t" - "vl %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git a/kernel/zarch/dmax.c 
b/kernel/zarch/dmax.c index cc0f23c87..65ed31f01 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmaxdb %%v16,%%v16,%%v24,0\n\t" "vfmaxdb %%v17,%%v17,%%v25,0\n\t" "vfmaxdb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 83d827d35..87bccbe55 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" @@ -59,14 +59,14 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 754828b7c..518cc262c 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmindb %%v16,%%v16,%%v24,0\n\t" "vfmindb %%v17,%%v17,%%v25,0\n\t" "vfmindb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index ff0fca48c..91561992f 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" @@ -59,14 +59,14 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index de2207fcd..8f0197f02 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -35,14 +35,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x]),3\n\t" - "vl %%v25, 16(%%r1,%[x]),3\n\t" - "vl %%v26, 32(%%r1,%[x]),3\n\t" - "vl %%v27, 48(%%r1,%[x]),3\n\t" - "vl %%v16, 0(%%r1,%[y]),3\n\t" - "vl %%v17, 16(%%r1,%[y]),3\n\t" - "vl %%v18, 32(%%r1,%[y]),3\n\t" - "vl %%v19, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x]),3\n\t" - "vst %%v29, 16(%%r1,%[x]),3\n\t" - "vst %%v30, 32(%%r1,%[x]),3\n\t" - "vst %%v31, 48(%%r1,%[x]),3\n\t" - "vst %%v20, 0(%%r1,%[y]),3\n\t" - "vst %%v21, 16(%%r1,%[y]),3\n\t" - "vst %%v22, 32(%%r1,%[y]),3\n\t" - "vst %%v23, 48(%%r1,%[y]),3\n\t" - "vl %%v24, 64(%%r1,%[x]),3\n\t" - "vl %%v25, 80(%%r1,%[x]),3\n\t" - "vl %%v26, 96(%%r1,%[x]),3\n\t" - "vl %%v27, 112(%%r1,%[x]),3\n\t" - "vl %%v16, 64(%%r1,%[y]),3\n\t" - "vl %%v17, 80(%%r1,%[y]),3\n\t" - "vl %%v18, 96(%%r1,%[y]),3\n\t" - "vl %%v19, 112(%%r1,%[y]),3\n\t" + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 
32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x]),3\n\t" - "vst %%v29, 80(%%r1,%[x]),3\n\t" - "vst %%v30, 96(%%r1,%[x]),3\n\t" - "vst %%v31, 112(%%r1,%[x]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v16, 128(%%r1,%[y]),3\n\t" - "vl %%v17, 144(%%r1,%[y]),3\n\t" - "vl %%v18, 160(%%r1,%[y]),3\n\t" - "vl %%v19, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x]),3\n\t" - "vst %%v29, 144(%%r1,%[x]),3\n\t" - "vst %%v30, 160(%%r1,%[x]),3\n\t" - "vst %%v31, 176(%%r1,%[x]),3\n\t" - "vst %%v20, 128(%%r1,%[y]),3\n\t" - "vst %%v21, 144(%%r1,%[y]),3\n\t" - "vst %%v22, 160(%%r1,%[y]),3\n\t" - "vst %%v23, 176(%%r1,%[y]),3\n\t" - "vl %%v24, 192(%%r1,%[x]),3\n\t" - "vl %%v25, 208(%%r1,%[x]),3\n\t" - "vl %%v26, 224(%%r1,%[x]),3\n\t" - "vl %%v27, 240(%%r1,%[x]),3\n\t" - "vl %%v16, 192(%%r1,%[y]),3\n\t" - "vl %%v17, 208(%%r1,%[y]),3\n\t" - "vl %%v18, 224(%%r1,%[y]),3\n\t" - "vl %%v19, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 
192(%%r1,%[x]),3\n\t" - "vst %%v29, 208(%%r1,%[x]),3\n\t" - "vst %%v30, 224(%%r1,%[x]),3\n\t" - "vst %%v31, 240(%%r1,%[x]),3\n\t" - "vst %%v20, 192(%%r1,%[y]),3\n\t" - "vst %%v21, 208(%%r1,%[y]),3\n\t" - "vst %%v22, 224(%%r1,%[y]),3\n\t" - "vst %%v23, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index bc58569d5..c944990b5 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -33,30 +33,30 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x]),3\n\t" - "vl %%v25,16(%%r1,%[x]),3\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x]),3\n\t" - "vl %%v26,32(%%r1,%[x]),3\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x]),3\n\t" - "vl %%v27,48(%%r1,%[x]),3\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x]),3\n\t" - "vl %%v28,64(%%r1,%[x]),3\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x]),3\n\t" - "vl %%v29,80(%%r1,%[x]),3\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x]),3\n\t" - "vl %%v30,96(%%r1,%[x]),3\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x]),3\n\t" - "vl %%v31,112(%%r1,%[x]),3\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x]),3\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) @@ -71,14 +71,14 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x]),3\n\t" - "vst %%v0,16(%%r1,%[x]),3\n\t" - "vst %%v0,32(%%r1,%[x]),3\n\t" - "vst %%v0,48(%%r1,%[x]),3\n\t" - "vst %%v0,64(%%r1,%[x]),3\n\t" - "vst %%v0,80(%%r1,%[x]),3\n\t" - "vst %%v0,96(%%r1,%[x]),3\n\t" - "vst %%v0,112(%%r1,%[x]),3\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index f4da46dc1..60ba40bd6 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -33,70 +33,70 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 
112(%%r1,%[x]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v28, 192(%%r1,%[x]),3\n\t" - "vl %%v29, 208(%%r1,%[x]),3\n\t" - "vl %%v30, 224(%%r1,%[x]),3\n\t" - "vl %%v31, 240(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" - "vl %%v4, 64(%%r1,%[y]),3\n\t" - "vl %%v5, 80(%%r1,%[y]),3\n\t" - "vl %%v6, 96(%%r1,%[y]),3\n\t" - "vl %%v7, 112(%%r1,%[y]),3\n\t" - "vst %%v0, 0(%%r1,%[x]),3\n\t" - "vst %%v1, 16(%%r1,%[x]),3\n\t" - "vst %%v2, 32(%%r1,%[x]),3\n\t" - "vst %%v3, 48(%%r1,%[x]),3\n\t" - "vst %%v4, 64(%%r1,%[x]),3\n\t" - "vst %%v5, 80(%%r1,%[x]),3\n\t" - "vst %%v6, 96(%%r1,%[x]),3\n\t" - "vst %%v7, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 128(%%r1,%[y]),3\n\t" - "vl %%v1, 144(%%r1,%[y]),3\n\t" - "vl %%v2, 160(%%r1,%[y]),3\n\t" - "vl %%v3, 176(%%r1,%[y]),3\n\t" - "vl %%v4, 192(%%r1,%[y]),3\n\t" - "vl %%v5, 208(%%r1,%[y]),3\n\t" - "vl %%v6, 224(%%r1,%[y]),3\n\t" - "vl %%v7, 240(%%r1,%[y]),3\n\t" - "vst %%v0, 128(%%r1,%[x]),3\n\t" - "vst %%v1, 144(%%r1,%[x]),3\n\t" - "vst %%v2, 160(%%r1,%[x]),3\n\t" - "vst %%v3, 176(%%r1,%[x]),3\n\t" - "vst %%v4, 192(%%r1,%[x]),3\n\t" - "vst %%v5, 208(%%r1,%[x]),3\n\t" - "vst %%v6, 224(%%r1,%[x]),3\n\t" - "vst %%v7, 240(%%r1,%[x]),3\n\t" - "vst %%v16, 0(%%r1,%[y]),3\n\t" - "vst %%v17, 16(%%r1,%[y]),3\n\t" - "vst %%v18, 32(%%r1,%[y]),3\n\t" - "vst %%v19, 48(%%r1,%[y]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vst %%v24, 128(%%r1,%[y]),3\n\t" - "vst %%v25, 144(%%r1,%[y]),3\n\t" - "vst %%v26, 160(%%r1,%[y]),3\n\t" - "vst %%v27, 176(%%r1,%[y]),3\n\t" - "vst %%v28, 192(%%r1,%[y]),3\n\t" - "vst %%v29, 208(%%r1,%[y]),3\n\t" - "vst %%v30, 224(%%r1,%[y]),3\n\t" - "vst %%v31, 240(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 
16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index bd0f18115..8434c811f 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 4884d1e3a..80a37e6c2 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index a6b95bf3e..18cdba437 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index c3f36d964..02ca427e4 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 83e5e93c9..43ae8ff8b 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -45,14 +45,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x]),3\n\t" - "vl %%v17, 144(%%r1,%[x]),3\n\t" - "vl %%v18, 160(%%r1,%[x]),3\n\t" - "vl %%v19, 176(%%r1,%[x]),3\n\t" - "vl %%v20, 192(%%r1,%[x]),3\n\t" - "vl %%v21, 208(%%r1,%[x]),3\n\t" - "vl %%v22, 224(%%r1,%[x]),3\n\t" - "vl %%v23, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 77bb09a2e..31549849d 100644 --- a/kernel/zarch/zaxpy.c +++ 
b/kernel/zarch/zaxpy.c @@ -45,22 +45,22 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x]),3\n\t" - "vl %%v9,16(%%r1,%[x]),3\n\t" - "vl %%v10,32(%%r1,%[x]),3\n\t" - "vl %%v11,48(%%r1,%[x]),3\n\t" - "vl %%v12,0(%%r1,%[y]),3\n\t" - "vl %%v13,16(%%r1,%[y]),3\n\t" - "vl %%v14,32(%%r1,%[y]),3\n\t" - "vl %%v15,48(%%r1,%[y]),3\n\t" - "vl %%v16,64(%%r1,%[x]),3\n\t" - "vl %%v17,80(%%r1,%[x]),3\n\t" - "vl %%v18,96(%%r1,%[x]),3\n\t" - "vl %%v19,112(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[y]),3\n\t" - "vl %%v21,80(%%r1,%[y]),3\n\t" - "vl %%v22,96(%%r1,%[y]),3\n\t" - "vl %%v23,112(%%r1,%[y]),3\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" "vpdi %%v24,%%v8,%%v8,4\n\t" "vpdi %%v25,%%v9,%%v9,4\n\t" "vpdi %%v26,%%v10,%%v10,4\n\t" @@ -85,14 +85,14 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y]),3\n\t" - "vst %%v9,16(%%r1,%[y]),3\n\t" - "vst %%v10,32(%%r1,%[y]),3\n\t" - "vst %%v11,48(%%r1,%[y]),3\n\t" - "vst %%v16,64(%%r1,%[y]),3\n\t" - "vst %%v17,80(%%r1,%[y]),3\n\t" - "vst %%v18,96(%%r1,%[y]),3\n\t" - "vst %%v19,112(%%r1,%[y]),3\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 8cfbaadb8..7a67ef734 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -41,14 +41,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" @@ -61,14 +61,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x]),3\n\t" - "vl %%v17, 80(%%r1,%[x]),3\n\t" - "vl %%v18, 96(%%r1,%[x]),3\n\t" - "vl %%v19, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 64(%%r1,%[y]),3\n\t" - "vl %%v1, 80(%%r1,%[y]),3\n\t" - "vl %%v2, 96(%%r1,%[y]),3\n\t" - "vl %%v3, 112(%%r1,%[y]),3\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 
96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 4b64fc8a5..7f21985ec 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" - "vl %%v17,16(%[x]),3\n\t" - "vl %%v18,32(%[x]),3\n\t" - "vl %%v19,48(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v20,8(%[x]),0\n\t" "wflcdb %%v20,%%v20\n\t" @@ -69,8 +69,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v24,0(%%r1,%[ap0])\n\t" "vlrepg %%v25,8(%%r1,%[ap0])\n\t" "vlrepg %%v26,0(%%r1,%[ap1])\n\t" @@ -103,8 +103,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -119,8 +119,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" - "vl %%v17,16(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v18,8(%[x]),0\n\t" "wflcdb %%v18,%%v18\n\t" @@ -142,8 +142,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v20,0(%%r1,%[ap0])\n\t" "vlrepg %%v21,8(%%r1,%[ap0])\n\t" "vlrepg %%v22,0(%%r1,%[ap1])\n\t" @@ -160,8 +160,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -173,7 +173,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v17,8(%[x]),0\n\t" "wflcdb %%v17,%%v17\n\t" @@ -188,8 +188,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 
2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v18,0(%%r1,%[ap])\n\t" "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vlrepg %%v20,16(%%r1,%[ap])\n\t" @@ -198,8 +198,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -227,14 +227,14 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v18,32(%%r1,%[src]),3\n\t" - "vl %%v19,48(%%r1,%[src]),3\n\t" - "vl %%v20,0(%%r1,%[dest]),3\n\t" - "vl %%v21,16(%%r1,%[dest]),3\n\t" - "vl %%v22,32(%%r1,%[dest]),3\n\t" - "vl %%v23,48(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl %%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -247,10 +247,10 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest]),3\n\t" - "vst %%v29,16(%%r1,%[dest]),3\n\t" - "vst %%v30,32(%%r1,%[dest]),3\n\t" - "vst %%v31,48(%%r1,%[dest]),3\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" "agfi %%r1,64\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 429824dcf..7b3e6c1fc 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -73,7 +73,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -120,10 +120,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v24,0(%[alpha]),0\n\t" "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y]),3\n\t" - "vl %%v27,16(%[y]),3\n\t" - "vl %%v28,32(%[y]),3\n\t" - "vl %%v29,48(%[y]),3\n\t" + "vl %%v26,0(%[y])\n\t" + "vl %%v27,16(%[y])\n\t" + "vl %%v28,32(%[y])\n\t" + "vl %%v29,48(%[y])\n\t" "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" @@ -132,10 +132,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb 
%%v28,%%v22,%%v25,%%v28\n\t" "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y]),3\n\t" - "vst %%v27,16(%[y]),3\n\t" - "vst %%v28,32(%[y]),3\n\t" - "vst %%v29,48(%[y]),3" + "vst %%v26,0(%[y])\n\t" + "vst %%v27,16(%[y])\n\t" + "vst %%v28,32(%[y])\n\t" + "vst %%v29,48(%[y])" : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -160,7 +160,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -178,7 +178,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -213,14 +213,14 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v20,0(%[alpha]),0\n\t" "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y]),3\n\t" - "vl %%v23,16(%[y]),3\n\t" + "vl %%v22,0(%[y])\n\t" + "vl %%v23,16(%[y])\n\t" "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y]),3\n\t" - "vst %%v23,16(%[y]),3\n\t" + "vst %%v22,0(%[y])\n\t" + "vst %%v23,16(%[y])\n\t" : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -239,7 +239,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -253,7 +253,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -282,10 +282,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vleg %%v18,0(%[alpha]),0\n\t" "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y]),3\n\t" + "vl %%v0,0(%[y])\n\t" "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y]),3\n\t" + "vst %%v0,0(%[y])\n\t" : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index ea81e4741..aa7f16605 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -35,14 +35,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x]),3\n\t" - "vl %%v25, 16(%%r1,%[x]),3\n\t" - 
"vl %%v26, 32(%%r1,%[x]),3\n\t" - "vl %%v27, 48(%%r1,%[x]),3\n\t" - "vl %%v16, 0(%%r1,%[y]),3\n\t" - "vl %%v17, 16(%%r1,%[y]),3\n\t" - "vl %%v18, 32(%%r1,%[y]),3\n\t" - "vl %%v19, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x]),3\n\t" - "vst %%v29, 16(%%r1,%[x]),3\n\t" - "vst %%v30, 32(%%r1,%[x]),3\n\t" - "vst %%v31, 48(%%r1,%[x]),3\n\t" - "vst %%v20, 0(%%r1,%[y]),3\n\t" - "vst %%v21, 16(%%r1,%[y]),3\n\t" - "vst %%v22, 32(%%r1,%[y]),3\n\t" - "vst %%v23, 48(%%r1,%[y]),3\n\t" - "vl %%v24, 64(%%r1,%[x]),3\n\t" - "vl %%v25, 80(%%r1,%[x]),3\n\t" - "vl %%v26, 96(%%r1,%[x]),3\n\t" - "vl %%v27, 112(%%r1,%[x]),3\n\t" - "vl %%v16, 64(%%r1,%[y]),3\n\t" - "vl %%v17, 80(%%r1,%[y]),3\n\t" - "vl %%v18, 96(%%r1,%[y]),3\n\t" - "vl %%v19, 112(%%r1,%[y]),3\n\t" + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x]),3\n\t" - "vst %%v29, 80(%%r1,%[x]),3\n\t" - "vst %%v30, 96(%%r1,%[x]),3\n\t" - "vst %%v31, 112(%%r1,%[x]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v16, 128(%%r1,%[y]),3\n\t" - "vl %%v17, 144(%%r1,%[y]),3\n\t" - "vl %%v18, 160(%%r1,%[y]),3\n\t" - "vl %%v19, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn 
*/ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x]),3\n\t" - "vst %%v29, 144(%%r1,%[x]),3\n\t" - "vst %%v30, 160(%%r1,%[x]),3\n\t" - "vst %%v31, 176(%%r1,%[x]),3\n\t" - "vst %%v20, 128(%%r1,%[y]),3\n\t" - "vst %%v21, 144(%%r1,%[y]),3\n\t" - "vst %%v22, 160(%%r1,%[y]),3\n\t" - "vst %%v23, 176(%%r1,%[y]),3\n\t" - "vl %%v24, 192(%%r1,%[x]),3\n\t" - "vl %%v25, 208(%%r1,%[x]),3\n\t" - "vl %%v26, 224(%%r1,%[x]),3\n\t" - "vl %%v27, 240(%%r1,%[x]),3\n\t" - "vl %%v16, 192(%%r1,%[y]),3\n\t" - "vl %%v17, 208(%%r1,%[y]),3\n\t" - "vl %%v18, 224(%%r1,%[y]),3\n\t" - "vl %%v19, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x]),3\n\t" - "vst %%v29, 208(%%r1,%[x]),3\n\t" - "vst %%v30, 224(%%r1,%[x]),3\n\t" - "vst %%v31, 240(%%r1,%[x]),3\n\t" - "vst %%v20, 192(%%r1,%[y]),3\n\t" - "vst %%v21, 208(%%r1,%[y]),3\n\t" - "vst %%v22, 224(%%r1,%[y]),3\n\t" - "vst %%v23, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 7fd62a1ac..fbcc0c5b9 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -36,14 +36,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -68,14 +68,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst 
%%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -93,14 +93,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vpdi %%v16,%%v16,%%v16,4\n\t" "vpdi %%v17,%%v17,%%v17,4\n\t" "vpdi %%v18,%%v18,%%v18,4\n\t" @@ -117,14 +117,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst %%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -139,14 +139,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t" @@ -155,14 +155,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst %%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -177,14 +177,14 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" 
"pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x]),3\n\t" - "vst %%v0,16(%%r1,%[x]),3\n\t" - "vst %%v0,32(%%r1,%[x]),3\n\t" - "vst %%v0,48(%%r1,%[x]),3\n\t" - "vst %%v0,64(%%r1,%[x]),3\n\t" - "vst %%v0,80(%%r1,%[x]),3\n\t" - "vst %%v0,96(%%r1,%[x]),3\n\t" - "vst %%v0,112(%%r1,%[x]),3\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0252ab8db..0f38103be 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -33,70 +33,70 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v28, 192(%%r1,%[x]),3\n\t" - "vl %%v29, 208(%%r1,%[x]),3\n\t" - "vl %%v30, 224(%%r1,%[x]),3\n\t" - "vl %%v31, 240(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" - "vl %%v4, 64(%%r1,%[y]),3\n\t" - "vl %%v5, 80(%%r1,%[y]),3\n\t" - "vl %%v6, 96(%%r1,%[y]),3\n\t" - "vl %%v7, 112(%%r1,%[y]),3\n\t" - "vst %%v0, 0(%%r1,%[x]),3\n\t" - "vst %%v1, 16(%%r1,%[x]),3\n\t" - "vst %%v2, 32(%%r1,%[x]),3\n\t" - "vst %%v3, 48(%%r1,%[x]),3\n\t" - "vst %%v4, 64(%%r1,%[x]),3\n\t" - "vst %%v5, 80(%%r1,%[x]),3\n\t" - "vst %%v6, 96(%%r1,%[x]),3\n\t" - "vst %%v7, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 128(%%r1,%[y]),3\n\t" - "vl %%v1, 144(%%r1,%[y]),3\n\t" - "vl %%v2, 160(%%r1,%[y]),3\n\t" - "vl %%v3, 176(%%r1,%[y]),3\n\t" - "vl %%v4, 192(%%r1,%[y]),3\n\t" - "vl %%v5, 208(%%r1,%[y]),3\n\t" - "vl %%v6, 224(%%r1,%[y]),3\n\t" - "vl %%v7, 240(%%r1,%[y]),3\n\t" - "vst %%v0, 128(%%r1,%[x]),3\n\t" - "vst %%v1, 144(%%r1,%[x]),3\n\t" - "vst %%v2, 160(%%r1,%[x]),3\n\t" - "vst %%v3, 176(%%r1,%[x]),3\n\t" - "vst %%v4, 192(%%r1,%[x]),3\n\t" - "vst %%v5, 208(%%r1,%[x]),3\n\t" - "vst %%v6, 224(%%r1,%[x]),3\n\t" - "vst %%v7, 240(%%r1,%[x]),3\n\t" - "vst %%v16, 0(%%r1,%[y]),3\n\t" - "vst %%v17, 16(%%r1,%[y]),3\n\t" - "vst %%v18, 32(%%r1,%[y]),3\n\t" - "vst %%v19, 48(%%r1,%[y]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vst %%v24, 128(%%r1,%[y]),3\n\t" - "vst %%v25, 144(%%r1,%[y]),3\n\t" - "vst %%v26, 160(%%r1,%[y]),3\n\t" - "vst %%v27, 176(%%r1,%[y]),3\n\t" - "vst %%v28, 192(%%r1,%[y]),3\n\t" - "vst %%v29, 208(%%r1,%[y]),3\n\t" - "vst %%v30, 224(%%r1,%[y]),3\n\t" - "vst %%v31, 240(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl 
%%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) From 69edc5bbe79af88710666aa909e7b39c89558b9c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 Feb 2019 20:06:13 +0100 Subject: [PATCH 448/935] Restore dropped patches in the non-TLS branch of memory.c (#2004) * Restore dropped patches in the non-TLS branch of memory.c As discovered in #2002, the reintroduction of the "original" non-TLS version of memory.c as an alternate branch had inadvertently used ba1f91f rather than a8002e2, thereby dropping the commits for #1450, #1468, #1501, #1504 and #1520.
--- driver/others/memory.c | 77 ++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 72d3e173c..2e185593e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1603,9 +1603,11 @@ void gotoblas_dummy_for_PGI(void) { #endif #else +/* USE_TLS / COMPILE_TLS not set */ + #include -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) #define ALLOC_WINDOWS #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 @@ -1619,7 +1621,7 @@ void gotoblas_dummy_for_PGI(void) { #include #include -#ifndef OS_WINDOWS +#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) #include #ifndef NO_SYSV_IPC #include @@ -1639,7 +1641,7 @@ void gotoblas_dummy_for_PGI(void) { #include #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include #endif @@ -1678,9 +1680,12 @@ void gotoblas_dummy_for_PGI(void) { #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else +#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#else +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -1740,7 +1745,8 @@ int i,n; size = CPU_ALLOC_SIZE(nums); ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; - nums = CPU_COUNT_S(size,cpusetp); + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -1756,7 +1762,7 @@ int get_num_procs(void) { return nums; } #endif - + #ifdef OS_HAIKU int get_num_procs(void) { static int nums = 0; @@ -1793,7 +1799,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -1870,7 +1876,7 @@ void openblas_fork_handler() // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // In the mean time build with USE_OPENMP=0 or link against another // implementation of OpenMP. 
-#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) +#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); if(err != 0) @@ -1883,7 +1889,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1891,11 +1897,11 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif - blas_goto_num = 0; + // blas_goto_num = 0; #ifndef USE_OPENMP blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -1907,7 +1913,7 @@ int blas_get_cpu_number(void){ #endif - blas_omp_num = 0; + // blas_omp_num = 0; blas_omp_num=openblas_omp_num_threads_env(); if (blas_omp_num < 0) blas_omp_num = 0; @@ -1915,7 +1921,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2002,11 +2008,15 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif } #ifdef OS_LINUX @@ -2148,14 +2158,18 @@ static void *alloc_mmap(void *address){ #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif - LOCK_COMMAND(&alloc_lock); if (map_address != (void *)-1) { +#if defined(SMP) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif } - UNLOCK_COMMAND(&alloc_lock); return map_address; } @@ -2554,6 +2568,11 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + +#if defined(USE_OPENMP) + if (!memory_initialized) { +#endif + LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { @@ -2589,6 +2608,9 @@ void *blas_memory_alloc(int procpos){ } UNLOCK_COMMAND(&alloc_lock); +#if defined(USE_OPENMP) + } +#endif #ifdef DEBUG printf("Alloc Start ...\n"); @@ -2603,13 +2625,17 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -// blas_lock(&memory[position].lock); - +#else + 
blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -// blas_unlock(&memory[position].lock); +#else + blas_unlock(&memory[position].lock); +#endif } position ++; @@ -2647,7 +2673,6 @@ void *blas_memory_alloc(int procpos){ memory[position].used = 1; UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ if (!memory[position].addr) { do { @@ -2693,9 +2718,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif memory[position].addr = map_address; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); @@ -2749,8 +2778,9 @@ void blas_memory_free(void *free_area){ #endif position = 0; +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); - +#endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; @@ -2764,7 +2794,9 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -2779,8 +2811,9 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); - +#endif return; } From 03a2bf2602714360fdf7096a4fc362ecfc700823 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Feb 2019 23:24:45 +0100 Subject: [PATCH 449/935] Fix potential memory leak in cpu enumeration on Linux (#2008) * Fix potential memory leak in cpu enumeration with glibc An early return after a failed call to sched_getaffinity would leak the previously allocated cpu_set_t. Wrong calculation of the size argument in that call increased the likelihood of that failure.
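In essence, the fix sizes the dynamically allocated mask with CPU_ALLOC_SIZE rather
than sizeof(cpu_set_t), and releases it on the failure path as well as on success.
A minimal standalone sketch of the corrected pattern (the count_usable_cpus name is
illustrative only, not the symbol used in driver/others/memory.c):

    /* Sketch: count usable CPUs without leaking the affinity mask.
       Assumes glibc >= 2.7 so the CPU_ALLOC/CPU_COUNT_S macros exist. */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <unistd.h>

    static int count_usable_cpus(void) {
      int nums = (int) sysconf(_SC_NPROCESSORS_CONF); /* configured CPUs */
      cpu_set_t *cpusetp = CPU_ALLOC(nums);
      if (cpusetp == NULL) return nums;           /* fall back to raw count */
      size_t size = CPU_ALLOC_SIZE(nums);         /* not sizeof(cpu_set_t)! */
      if (sched_getaffinity(0, size, cpusetp) != 0) {
        CPU_FREE(cpusetp);                        /* this free was missing  */
        return nums;
      }
      int ret = CPU_COUNT_S(size, cpusetp);       /* CPUs in affinity mask  */
      if (ret > 0 && ret < nums) nums = ret;
      CPU_FREE(cpusetp);
      return nums;
    }

The actual hunks below additionally keep a branch that uses a stack-allocated
cpu_set_t whenever nums fits within CPU_SETSIZE, avoiding the heap allocation
entirely in the common case.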
Fixes #2003 --- driver/others/memory.c | 123 ++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 38 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 2e185593e..09851f15c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -198,45 +198,68 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; + if (nums >= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } @@ -1709,46 +1732,70 @@ void goto_set_num_threads(int num_threads) {}; int get_num_procs(void); #else int get_num_procs(void) { + static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; + if (nums >= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } From 77fe70019f0fb4064eec2a5b26a6057acef29b58 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 11 Feb 2019 16:01:13 +0200 Subject: [PATCH 450/935] [ZARCH] Fix constraints and source code formatting --- kernel/zarch/camax.c | 212 +++++------ kernel/zarch/camin.c | 212 +++++------ kernel/zarch/casum.c | 154 ++++---- kernel/zarch/caxpy.c | 130 +++---- kernel/zarch/ccopy.c | 21 +- kernel/zarch/cdot.c | 148 ++++---- 
kernel/zarch/cgemv_n_4.c | 590 ++++++++++++++--------------- kernel/zarch/cgemv_t_4.c | 52 +-- kernel/zarch/crot.c | 291 +++++++-------- kernel/zarch/cscal.c | 309 ++++++++-------- kernel/zarch/cswap.c | 151 ++++---- kernel/zarch/damax.c | 90 ++--- kernel/zarch/damax_z13.c | 158 ++++---- kernel/zarch/damin.c | 90 ++--- kernel/zarch/damin_z13.c | 158 ++++---- kernel/zarch/dasum.c | 150 ++++---- kernel/zarch/daxpy.c | 152 ++++---- kernel/zarch/dcopy.c | 20 +- kernel/zarch/ddot.c | 108 +++--- kernel/zarch/dgemv_n_4.c | 624 ++++++++++++++++--------------- kernel/zarch/dgemv_t_4.c | 780 ++++++++++++++++++++------------------- kernel/zarch/dmax.c | 90 ++--- kernel/zarch/dmax_z13.c | 124 +++---- kernel/zarch/dmin.c | 90 ++--- kernel/zarch/dmin_z13.c | 124 +++---- kernel/zarch/drot.c | 291 +++++++-------- kernel/zarch/dscal.c | 102 ++--- kernel/zarch/dsdot.c | 171 ++++----- kernel/zarch/dswap.c | 151 ++++---- kernel/zarch/icamax.c | 370 +++++++++---------- kernel/zarch/icamin.c | 370 +++++++++---------- kernel/zarch/idamax.c | 264 ++++++------- kernel/zarch/idamin.c | 264 ++++++------- kernel/zarch/idmax.c | 230 ++++++------ kernel/zarch/idmin.c | 230 ++++++------ kernel/zarch/isamax.c | 352 +++++++++--------- kernel/zarch/isamin.c | 352 +++++++++--------- kernel/zarch/ismax.c | 318 ++++++++-------- kernel/zarch/ismin.c | 318 ++++++++-------- kernel/zarch/izamax.c | 256 ++++++------- kernel/zarch/izamin.c | 256 ++++++------- kernel/zarch/samax.c | 94 ++--- kernel/zarch/samin.c | 94 ++--- kernel/zarch/sasum.c | 154 ++++---- kernel/zarch/saxpy.c | 152 ++++---- kernel/zarch/scopy.c | 20 +- kernel/zarch/sdot.c | 116 +++--- kernel/zarch/sgemv_n_4.c | 584 +++++++++++++++-------------- kernel/zarch/sgemv_t_4.c | 766 +++++++++++++++++++------------------- kernel/zarch/smax.c | 94 ++--- kernel/zarch/smin.c | 94 ++--- kernel/zarch/srot.c | 291 +++++++-------- kernel/zarch/sscal.c | 102 ++--- kernel/zarch/sswap.c | 151 ++++---- kernel/zarch/zamax.c | 166 ++++----- kernel/zarch/zamax_z13.c | 184 ++++----- kernel/zarch/zamin.c | 166 ++++----- kernel/zarch/zamin_z13.c | 184 ++++----- kernel/zarch/zasum.c | 150 ++++---- kernel/zarch/zaxpy.c | 138 +++---- kernel/zarch/zcopy.c | 21 +- kernel/zarch/zdot.c | 140 +++---- kernel/zarch/zgemv_n_4.c | 414 +++++++++++---------- kernel/zarch/zgemv_t_4.c | 452 ++++++++++++----------- kernel/zarch/zrot.c | 291 +++++++-------- kernel/zarch/zscal.c | 301 +++++++-------- kernel/zarch/zswap.c | 151 ++++---- 67 files changed, 7439 insertions(+), 7354 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 40a9903e9..b10ca4752 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -34,112 +34,112 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v16,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v16,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v16,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v16,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v16,%%v16\n\t" - "vfasb %%v0,%%v0,%%v16\n\t" - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,8,4\n\t" - "vleib %%v1,9,5\n\t" - "vleib %%v1,10,6\n\t" - "vleib %%v1,11,7\n\t" - "vleib %%v1,16,8\n\t" - "vleib %%v1,17,9\n\t" - "vleib %%v1,18,10\n\t" - "vleib %%v1,19,11\n\t" - "vleib %%v1,24,12\n\t" - "vleib %%v1,25,13\n\t" - "vleib %%v1,26,14\n\t" - "vleib %%v1,27,15\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl 
%%v16,0(%%r1,%[x])\n\t" - "vl %%v2,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v2\n\t" - "vperm %%v16,%%v16,%%v2,%%v1\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v2,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v2,%%v1\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v2,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v2,%%v1\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v2,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v2,%%v1\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v2,144(%%r1,%[x])\n\t" - "vpkg %%v25,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v2,%%v1\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v2,176(%%r1,%[x])\n\t" - "vpkg %%v27,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v2,%%v1\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v2,208(%%r1,%[x])\n\t" - "vpkg %%v29,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v2,%%v1\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v2,240(%%r1,%[x])\n\t" - "vpkg %%v31,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v2,%%v1\n\t" - "vflpsb %%v16,%%v16\n\t" - "vflpsb %%v17,%%v17\n\t" - "vflpsb %%v18,%%v18\n\t" - "vflpsb %%v19,%%v19\n\t" - "vflpsb %%v20,%%v20\n\t" - "vflpsb %%v21,%%v21\n\t" - "vflpsb %%v22,%%v22\n\t" - "vflpsb %%v23,%%v23\n\t" - "vflpsb %%v24,%%v24\n\t" - "vflpsb %%v25,%%v25\n\t" - "vflpsb %%v26,%%v26\n\t" - "vflpsb %%v27,%%v27\n\t" - "vflpsb %%v28,%%v28\n\t" - "vflpsb %%v29,%%v29\n\t" - "vflpsb %%v30,%%v30\n\t" - "vflpsb %%v31,%%v31\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v18,%%v18,%%v19\n\t" - "vfasb %%v20,%%v20,%%v21\n\t" - "vfasb %%v22,%%v22,%%v23\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v26,%%v26,%%v27\n\t" - "vfasb %%v28,%%v28,%%v29\n\t" - "vfasb %%v30,%%v30,%%v31\n\t" - "vfmaxsb %%v16,%%v16,%%v24,0\n\t" - "vfmaxsb %%v18,%%v18,%%v26,0\n\t" - "vfmaxsb %%v20,%%v20,%%v28,0\n\t" - "vfmaxsb %%v22,%%v22,%%v30,0\n\t" - "vfmaxsb %%v16,%%v16,%%v20,0\n\t" - "vfmaxsb %%v18,%%v18,%%v22,0\n\t" - "vfmaxsb %%v16,%%v16,%%v18,0\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,0\n\t" - "ler %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm 
%%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); return amax; } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 842635afc..40945fae8 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -34,112 +34,112 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v16,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v16,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v16,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v16,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v16,%%v16\n\t" - "vfasb %%v0,%%v0,%%v16\n\t" - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,8,4\n\t" - "vleib %%v1,9,5\n\t" - "vleib %%v1,10,6\n\t" - "vleib %%v1,11,7\n\t" - "vleib %%v1,16,8\n\t" - "vleib %%v1,17,9\n\t" - "vleib %%v1,18,10\n\t" - "vleib %%v1,19,11\n\t" - "vleib %%v1,24,12\n\t" - "vleib %%v1,25,13\n\t" - "vleib %%v1,26,14\n\t" - "vleib %%v1,27,15\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v2,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v2\n\t" - "vperm %%v16,%%v16,%%v2,%%v1\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v2,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v2,%%v1\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v2,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v2,%%v1\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v2,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v2,%%v1\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v2,144(%%r1,%[x])\n\t" - "vpkg %%v25,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v2,%%v1\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v2,176(%%r1,%[x])\n\t" - "vpkg %%v27,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v2,%%v1\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v2,208(%%r1,%[x])\n\t" - "vpkg %%v29,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v2,%%v1\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v2,240(%%r1,%[x])\n\t" - "vpkg %%v31,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v2,%%v1\n\t" - "vflpsb %%v16,%%v16\n\t" - "vflpsb %%v17,%%v17\n\t" - "vflpsb %%v18,%%v18\n\t" - "vflpsb %%v19,%%v19\n\t" - "vflpsb %%v20,%%v20\n\t" - "vflpsb %%v21,%%v21\n\t" - "vflpsb %%v22,%%v22\n\t" - "vflpsb %%v23,%%v23\n\t" - "vflpsb %%v24,%%v24\n\t" - "vflpsb %%v25,%%v25\n\t" - "vflpsb %%v26,%%v26\n\t" - "vflpsb %%v27,%%v27\n\t" - "vflpsb %%v28,%%v28\n\t" - "vflpsb %%v29,%%v29\n\t" - "vflpsb %%v30,%%v30\n\t" - "vflpsb %%v31,%%v31\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v18,%%v18,%%v19\n\t" - "vfasb %%v20,%%v20,%%v21\n\t" - "vfasb %%v22,%%v22,%%v23\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v26,%%v26,%%v27\n\t" - "vfasb %%v28,%%v28,%%v29\n\t" - "vfasb %%v30,%%v30,%%v31\n\t" - "vfminsb %%v16,%%v16,%%v24,0\n\t" - "vfminsb %%v18,%%v18,%%v26,0\n\t" - "vfminsb %%v20,%%v20,%%v28,0\n\t" - "vfminsb %%v22,%%v22,%%v30,0\n\t" - "vfminsb %%v16,%%v16,%%v20,0\n\t" - "vfminsb %%v18,%%v18,%%v22,0\n\t" - "vfminsb %%v16,%%v16,%%v18,0\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,0\n\t" - "ler %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl 
%%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); return amin; } diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index f59e5a20b..e28f2018c 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -34,83 +34,83 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb 
%%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index d86342bd0..e4b484ab7 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -30,73 +30,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__( #if !defined(CONJ) - "vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" + "vlrepf %%v0,0(%[alpha])\n\t" + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" #else - "vlef %%v0,0(%[alpha]),1\n\t" - "vlef %%v0,0(%[alpha]),3\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,0(%[alpha]),0\n\t" - "vlef %%v0,0(%[alpha]),2\n\t" - "vlrepf %%v1,4(%[alpha])\n\t" + "vlef %%v0,0(%[alpha]),1\n\t" + "vlef %%v0,0(%[alpha]),3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,0(%[alpha]),0\n\t" + "vlef %%v0,0(%[alpha]),2\n\t" + "vlrepf %%v1,4(%[alpha])\n\t" #endif - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" - "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" - "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" - "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" - "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" - "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" - "vfmasb %%v17,%%v17,%%v0,%%v21\n\t" - "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" - "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" - "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" - "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" - "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" - "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" - "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmasb 
%%v17,%%v17,%%v0,%%v21\n\t" + "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index d17bddcc8..0a5e03992 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -29,16 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + [n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index 64d81ae5c..d90f9c871 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -29,80 +29,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "verllg %%v22,%%v18,32\n\t" - "verllg %%v23,%%v19,32\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "verllg %%v22,%%v18,32\n\t" - "verllg %%v23,%%v19,32\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vrepg %%v26,%%v24,1\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v25,%%v25,%%v27\n\t" - "vfasb %%v25,%%v25,%%v29\n\t" - "vfasb %%v25,%%v25,%%v31\n\t" - "vrepg %%v27,%%v25,1\n\t" - "vfasb %%v25,%%v25,%%v27\n\t" - "vstef %%v24,0(%[d]),0\n\t" - "vstef %%v24,4(%[d]),1\n\t" - "vstef %%v25,8(%[d]),1\n\t" - "vstef %%v25,12(%[d]),0" - : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + 
"vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vrepg %%v26,%%v24,1\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vfasb %%v25,%%v25,%%v29\n\t" + "vfasb %%v25,%%v25,%%v31\n\t" + "vrepg %%v27,%%v25,1\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vstef %%v24,0(%[d]),0\n\t" + "vstef %%v24,4(%[d]),1\n\t" + "vstef %%v25,8(%[d]),1\n\t" + "vstef %%v25,12(%[d]),0" + : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index db91d9063..adba05d47 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -30,323 +30,331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepg %%v16,0(%[x])\n\t" - "vlrepg %%v17,8(%[x])\n\t" - "vlrepg %%v18,16(%[x])\n\t" - "vlrepg %%v19,24(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v18,16(%[x])\n\t" + "vlrepg %%v19,24(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%[x]),0\n\t" - "vlef %%v20,4(%[x]),2\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,0(%[x]),1\n\t" - "vlef %%v20,0(%[x]),3\n\t" - "vlef %%v21,12(%[x]),0\n\t" - "vlef %%v21,12(%[x]),2\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,8(%[x]),1\n\t" - "vlef %%v21,8(%[x]),3\n\t" - "vlef %%v22,20(%[x]),0\n\t" - "vlef %%v22,20(%[x]),2\n\t" - "vflcsb %%v22,%%v22\n\t" - "vlef %%v22,16(%[x]),1\n\t" - "vlef %%v22,16(%[x]),3\n\t" - "vlef %%v23,28(%[x]),0\n\t" - "vlef %%v23,28(%[x]),2\n\t" - "vflcsb %%v23,%%v23\n\t" - "vlef %%v23,24(%[x]),1\n\t" - "vlef %%v23,24(%[x]),3\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" #else - "vlef %%v20,0(%[x]),1\n\t" - "vlef %%v20,0(%[x]),3\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,4(%[x]),0\n\t" - "vlef %%v20,4(%[x]),2\n\t" - "vlef %%v21,8(%[x]),1\n\t" - 
"vlef %%v21,8(%[x]),3\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,12(%[x]),0\n\t" - "vlef %%v21,12(%[x]),2\n\t" - "vlef %%v22,16(%[x]),1\n\t" - "vlef %%v22,16(%[x]),3\n\t" - "vflcsb %%v22,%%v22\n\t" - "vlef %%v22,20(%[x]),0\n\t" - "vlef %%v22,20(%[x]),2\n\t" - "vlef %%v23,24(%[x]),1\n\t" - "vlef %%v23,24(%[x]),3\n\t" - "vflcsb %%v23,%%v23\n\t" - "vlef %%v23,28(%[x]),0\n\t" - "vlef %%v23,28(%[x]),2\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" #endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vperm %%v25,%%v24,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v24,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap1])\n\t" - "vperm %%v27,%%v26,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v26,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" - "vl %%v28,0(%%r1,%[ap2])\n\t" - "vperm %%v29,%%v28,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v28,%%v1\n\t" - "vl %%v30,0(%%r1,%[ap3])\n\t" - "vperm %%v31,%%v30,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v30,%%v1\n\t" - "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" - "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" - "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" 
+ "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vperm %%v25,%%v24,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v24,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap1])\n\t" + "vperm %%v27,%%v26,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v26,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" + "vl %%v28,0(%%r1,%[ap2])\n\t" + "vperm %%v29,%%v28,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v28,%%v1\n\t" + "vl %%v30,0(%%r1,%[ap3])\n\t" + "vperm %%v31,%%v30,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v30,%%v1\n\t" + "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepg %%v16,0(%[x])\n\t" - "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%[x]),0\n\t" - "vlef %%v18,4(%[x]),2\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,0(%[x]),1\n\t" - "vlef %%v18,0(%[x]),3\n\t" - "vlef %%v19,12(%[x]),0\n\t" - "vlef %%v19,12(%[x]),2\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,8(%[x]),1\n\t" - "vlef %%v19,8(%[x]),3\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" #else - "vlef %%v18,0(%[x]),1\n\t" - "vlef %%v18,0(%[x]),3\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,4(%[x]),0\n\t" - "vlef %%v18,4(%[x]),2\n\t" - "vlef %%v19,8(%[x]),1\n\t" - "vlef %%v19,8(%[x]),3\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,12(%[x]),0\n\t" - "vlef %%v19,12(%[x]),2\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" #endif - "vleib 
%%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v20,0(%%r1,%[ap0])\n\t" - "vperm %%v21,%%v20,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v20,%%v1\n\t" - "vl %%v22,0(%%r1,%[ap1])\n\t" - "vperm %%v23,%%v22,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v22,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" - "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v20,0(%%r1,%[ap0])\n\t" + "vperm %%v21,%%v20,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v20,%%v1\n\t" + "vl %%v22,0(%%r1,%[ap1])\n\t" + "vperm %%v23,%%v22,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v22,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__("vlrepg %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( 
defined(CONJ) && defined(XCONJ) ) - "vlef %%v17,4(%[x]),0\n\t" - "vlef %%v17,4(%[x]),2\n\t" - "vflcsb %%v17,%%v17\n\t" - "vlef %%v17,0(%[x]),1\n\t" - "vlef %%v17,0(%[x]),3\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" #else - "vlef %%v17,0(%[x]),1\n\t" - "vlef %%v17,0(%[x]),3\n\t" - "vflcsb %%v17,%%v17\n\t" - "vlef %%v17,4(%[x]),0\n\t" - "vlef %%v17,4(%[x]),2\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" #endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v18,0(%%r1,%[ap])\n\t" - "vperm %%v19,%%v18,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v18,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v18,0(%%r1,%[ap])\n\t" + "vperm %%v19,%%v18,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v18,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) { __asm__( #if !defined(XCONJ) - "vlrepf %%v0,%[alpha_r]\n\t" 
- "vlef %%v1,%[alpha_i],0\n\t" - "vlef %%v1,%[alpha_i],2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,%[alpha_i],1\n\t" - "vlef %%v1,%[alpha_i],3\n\t" + "vlrepf %%v0,%[alpha_r]\n\t" + "vlef %%v1,%[alpha_i],0\n\t" + "vlef %%v1,%[alpha_i],2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,%[alpha_i],1\n\t" + "vlef %%v1,%[alpha_i],3\n\t" #else - "vlef %%v0,%[alpha_r],1\n\t" - "vlef %%v0,%[alpha_r],3\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,%[alpha_r],0\n\t" - "vlef %%v0,%[alpha_r],2\n\t" - "vlrepf %%v1,%[alpha_i]\n\t" + "vlef %%v0,%[alpha_r],1\n\t" + "vlef %%v0,%[alpha_r],3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,%[alpha_r],0\n\t" + "vlef %%v0,%[alpha_r],2\n\t" + "vlrepf %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],2\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,0(%%r1,%[dest])\n\t" - "vl %%v19,16(%%r1,%[dest])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" - "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" - "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" - "vst %%v22,0(%%r1,%[dest])\n\t" - "vst %%v23,16(%%r1,%[dest])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), - [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,0(%%r1,%[dest])\n\t" + "vl %%v19,16(%%r1,%[dest])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" + "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" + "vst %%v22,0(%%r1,%[dest])\n\t" + "vst %%v23,16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 9e65c5fb5..91ea1c10c 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -31,6 +31,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c
index 9e65c5fb5..91ea1c10c 100644
--- a/kernel/zarch/cgemv_t_4.c
+++ b/kernel/zarch/cgemv_t_4.c
@@ -31,6 +31,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                              FLOAT *alpha) {
+  register FLOAT *ap0 = ap[0];
+  register FLOAT *ap1 = ap[1];
+  register FLOAT *ap2 = ap[2];
+  register FLOAT *ap3 = ap[3];
+
   __asm__("vzero %%v16\n\t"
           "vzero %%v17\n\t"
           "vzero %%v18\n\t"
@@ -154,20 +159,23 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
           "vst %%v22,0(%[y])\n\t"
           "vst %%v23,16(%[y])"
-        : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n)
-        : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]),
-        "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]),
-        "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]),
-        "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]),
-        "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
-        "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
-        : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
-        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-        "v31");
+          : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
+          "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
+          "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
+          "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
+          "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
+          "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
+          "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+          "v31");
 }
 
 static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                              FLOAT *alpha) {
+  register FLOAT *ap0 = ap[0];
+  register FLOAT *ap1 = ap[1];
+
   __asm__("vzero %%v16\n\t"
           "vzero %%v17\n\t"
           "vzero %%v18\n\t"
@@ -263,13 +271,13 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
           "vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
           "vst %%v20,0(%[y])"
-        : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n)
-        : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]),
-        "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]),
-        "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
-        "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
-        : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
-        "v21", "v22", "v23");
+          : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
+          "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
+          "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
+          "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
+          "v21", "v22", "v23");
 }
 
 static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
@@ -353,11 +361,11 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
           "vfmasb %%v0,%%v16,%%v18,%%v0\n\t"
           "vfmasb %%v0,%%v17,%%v19,%%v0\n\t"
           "vsteg %%v0,0(%[y]),0"
-        : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n)
-        : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap),
-        "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
-        "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
-        : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+          : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
+          "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
+          "m"(*(const struct { FLOAT
x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c index 669d78a9d..aab155f8b 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepf %%v0,%[c]\n\t" - "vlrepf %%v1,%[s]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb 
%%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst 
%%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst 
%%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index a2d5bf223..9fc54cf29 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -29,171 +29,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v24,%%v16,32\n\t" - "verllg %%v25,%%v17,32\n\t" - "verllg %%v26,%%v18,32\n\t" - "verllg %%v27,%%v19,32\n\t" - "verllg %%v28,%%v20,32\n\t" - "verllg %%v29,%%v21,32\n\t" - "verllg %%v30,%%v22,32\n\t" - "verllg %%v31,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v24,%%v16,32\n\t" + "verllg %%v25,%%v17,32\n\t" + "verllg %%v26,%%v18,32\n\t" + "verllg %%v27,%%v19,32\n\t" + "verllg %%v28,%%v20,32\n\t" + "verllg %%v29,%%v21,32\n\t" + "verllg %%v30,%%v22,32\n\t" + "verllg %%v31,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + 
"vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlef %%v0,4(%[alpha]),0\n\t" - "vlef %%v0,4(%[alpha]),2\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,4(%[alpha]),1\n\t" - "vlef %%v0,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v16,%%v16,32\n\t" - "verllg %%v17,%%v17,32\n\t" - "verllg %%v18,%%v18,32\n\t" - "verllg %%v19,%%v19,32\n\t" - "verllg %%v20,%%v20,32\n\t" - "verllg %%v21,%%v21,32\n\t" - "verllg %%v22,%%v22,32\n\t" - "verllg %%v23,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "vlef %%v0,4(%[alpha]),2\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,4(%[alpha]),1\n\t" + "vlef %%v0,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v16,%%v16,32\n\t" + "verllg %%v17,%%v17,32\n\t" + "verllg %%v18,%%v18,32\n\t" + "verllg %%v19,%%v19,32\n\t" + "verllg %%v20,%%v20,32\n\t" + "verllg %%v21,%%v21,32\n\t" + "verllg %%v22,%%v22,32\n\t" + "verllg %%v23,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb 
%%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" 
+ "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 92a81591f..198994e18 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 
112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 37008f702..caacb50dc 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -34,51 +34,51 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxdb %%v16,%%v16,%%v24,8\n\t" - "vfmaxdb %%v17,%%v17,%%v25,8\n\t" - "vfmaxdb %%v18,%%v18,%%v26,8\n\t" - "vfmaxdb %%v19,%%v19,%%v27,8\n\t" - "vfmaxdb %%v20,%%v20,%%v28,8\n\t" - "vfmaxdb %%v21,%%v21,%%v29,8\n\t" - "vfmaxdb %%v22,%%v22,%%v30,8\n\t" - "vfmaxdb %%v23,%%v23,%%v31,8\n\t" - "vfmaxdb %%v16,%%v16,%%v20,8\n\t" - "vfmaxdb %%v17,%%v17,%%v21,8\n\t" - "vfmaxdb %%v18,%%v18,%%v22,8\n\t" - "vfmaxdb %%v19,%%v19,%%v23,8\n\t" - 
"vfmaxdb %%v16,%%v16,%%v18,8\n\t" - "vfmaxdb %%v17,%%v17,%%v19,8\n\t" - "vfmaxdb %%v16,%%v16,%%v17,8\n\t" - "vfmaxdb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,8\n\t" - "lpdr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,8\n\t" + "vfmaxdb %%v17,%%v17,%%v25,8\n\t" + "vfmaxdb %%v18,%%v18,%%v26,8\n\t" + "vfmaxdb %%v19,%%v19,%%v27,8\n\t" + "vfmaxdb %%v20,%%v20,%%v28,8\n\t" + "vfmaxdb %%v21,%%v21,%%v29,8\n\t" + "vfmaxdb %%v22,%%v22,%%v30,8\n\t" + "vfmaxdb %%v23,%%v23,%%v31,8\n\t" + "vfmaxdb %%v16,%%v16,%%v20,8\n\t" + "vfmaxdb %%v17,%%v17,%%v21,8\n\t" + "vfmaxdb %%v18,%%v18,%%v22,8\n\t" + "vfmaxdb %%v19,%%v19,%%v23,8\n\t" + "vfmaxdb %%v16,%%v16,%%v18,8\n\t" + "vfmaxdb %%v17,%%v17,%%v19,8\n\t" + "vfmaxdb %%v16,%%v16,%%v17,8\n\t" + "vfmaxdb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 530d6e5bb..f3db4c108 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -34,85 +34,85 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl 
%%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg 
%%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index a01791741..0163a144b 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -34,51 +34,51 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmindb %%v16,%%v16,%%v24,8\n\t" - "vfmindb %%v17,%%v17,%%v25,8\n\t" - "vfmindb %%v18,%%v18,%%v26,8\n\t" - "vfmindb %%v19,%%v19,%%v27,8\n\t" - "vfmindb %%v20,%%v20,%%v28,8\n\t" - "vfmindb %%v21,%%v21,%%v29,8\n\t" - "vfmindb %%v22,%%v22,%%v30,8\n\t" - "vfmindb %%v23,%%v23,%%v31,8\n\t" - "vfmindb %%v16,%%v16,%%v20,8\n\t" - "vfmindb %%v17,%%v17,%%v21,8\n\t" - "vfmindb %%v18,%%v18,%%v22,8\n\t" - "vfmindb %%v19,%%v19,%%v23,8\n\t" - "vfmindb %%v16,%%v16,%%v18,8\n\t" - "vfmindb %%v17,%%v17,%%v19,8\n\t" - "vfmindb %%v16,%%v16,%%v17,8\n\t" - "vfmindb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,8\n\t" - "lpdr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,8\n\t" + "vfmindb %%v17,%%v17,%%v25,8\n\t" + "vfmindb %%v18,%%v18,%%v26,8\n\t" + "vfmindb %%v19,%%v19,%%v27,8\n\t" + "vfmindb %%v20,%%v20,%%v28,8\n\t" + "vfmindb %%v21,%%v21,%%v29,8\n\t" + "vfmindb %%v22,%%v22,%%v30,8\n\t" + "vfmindb %%v23,%%v23,%%v31,8\n\t" + "vfmindb %%v16,%%v16,%%v20,8\n\t" + "vfmindb %%v17,%%v17,%%v21,8\n\t" + "vfmindb %%v18,%%v18,%%v22,8\n\t" + "vfmindb %%v19,%%v19,%%v23,8\n\t" + "vfmindb %%v16,%%v16,%%v18,8\n\t" + "vfmindb %%v17,%%v17,%%v19,8\n\t" + "vfmindb %%v16,%%v16,%%v17,8\n\t" + "vfmindb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", 
"v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 2172b6d6f..4196b2e15 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -34,85 +34,85 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb 
%%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 9f69a9931..aa1382b10 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -34,81 +34,81 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, 
%%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 179ef8834..5b0208c20 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -29,82 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepg %%v0,%[alpha]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl %%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" - "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), - [alpha] "m"(*alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + 
"vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + [alpha] "Q"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index b6a740c43..691b90c64 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -29,16 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f5f601717..9cad68f4b 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -31,60 +31,60 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { FLOAT dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "ldr %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), - [y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmadb 
%%v3,%%v19,%%v27,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index c93ff9b54..502ba837e 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -31,324 +31,334 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v1,8(%[x])\n\t" - "vlrepg %%v2,16(%[x])\n\t" - "vlrepg %%v3,24(%[x])\n\t" - "vlrepg %%v4,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v4\n\t" - "vfmdb %%v1,%%v1,%%v4\n\t" - "vfmdb %%v2,%%v2,%%v4\n\t" - "vfmdb %%v3,%%v3,%%v4\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl 
%%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,16(%[x])\n\t" + "vlrepg %%v3,24(%[x])\n\t" + "vlrepg %%v4,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v4\n\t" + "vfmdb %%v1,%%v1,%%v4\n\t" + "vfmdb %%v2,%%v2,%%v4\n\t" + "vfmdb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl 
%%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", 
"v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v1,8(%[x])\n\t" - "vlrepg %%v2,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v2\n\t" - "vfmdb %%v1,%%v1,%%v2\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" - "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" - "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" - "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" - "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v2\n\t" + "vfmdb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + 
"vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v16,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v16\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl 
%%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,0(%%r1,%[y])\n\t" - "vl %%v19,16(%%r1,%[y])\n\t" - "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" - "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y])\n\t" - "vst %%v19,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepg %%v16,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" + "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" + "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static 
void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 24680cf1b..de72a1798 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -30,333 +30,341 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi 
%%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v1,%%v1,%%v5\n\t" - "vfadb %%v2,%%v2,%%v6\n\t" - "vfadb %%v3,%%v3,%%v7\n\t" - "vrepg %%v4,%%v0,1\n\t" - "adbr %%f0,%%f4\n\t" - "std %%f0,0(%[y])\n\t" - "vrepg %%v4,%%v1,1\n\t" - "adbr %%f1,%%f4\n\t" - "std %%f1,8(%[y])\n\t" - "vrepg %%v4,%%v2,1\n\t" - "adbr %%f2,%%f4\n\t" - "std %%f2,16(%[y])\n\t" - "vrepg %%v4,%%v3,1\n\t" - "adbr %%f3,%%f4\n\t" - "std %%f3,24(%[y])" - : "=m"(*(FLOAT (*)[4]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl 
%%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v2,%%v2,%%v6\n\t" + "vfadb %%v3,%%v3,%%v7\n\t" + "vrepg %%v4,%%v0,1\n\t" + "adbr %%f0,%%f4\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v4,%%v1,1\n\t" + "adbr %%f1,%%f4\n\t" + "std %%f1,8(%[y])\n\t" + "vrepg %%v4,%%v2,1\n\t" + "adbr %%f2,%%f4\n\t" + "std %%f2,16(%[y])\n\t" + "vrepg %%v4,%%v3,1\n\t" + "adbr %%f3,%%f4\n\t" + "std %%f3,24(%[y])" + : "=m"(*(struct { FLOAT x[4]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl 
%%v21,80(%%r1,%[x])\n\t"
-        "vl %%v22,96(%%r1,%[x])\n\t"
-        "vl %%v23,112(%%r1,%[x])\n\t"
-        "vl %%v24,0(%%r1,%[ap0])\n\t"
-        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-        "vl %%v25,0(%%r1,%[ap1])\n\t"
-        "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
-        "vl %%v26,16(%%r1,%[ap0])\n\t"
-        "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
-        "vl %%v27,16(%%r1,%[ap1])\n\t"
-        "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
-        "vl %%v28,32(%%r1,%[ap0])\n\t"
-        "vfmadb %%v4,%%v18,%%v28,%%v4\n\t"
-        "vl %%v29,32(%%r1,%[ap1])\n\t"
-        "vfmadb %%v5,%%v18,%%v29,%%v5\n\t"
-        "vl %%v30,48(%%r1,%[ap0])\n\t"
-        "vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
-        "vl %%v31,48(%%r1,%[ap1])\n\t"
-        "vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
-        "vl %%v24,64(%%r1,%[ap0])\n\t"
-        "vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
-        "vl %%v25,64(%%r1,%[ap1])\n\t"
-        "vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
-        "vl %%v26,80(%%r1,%[ap0])\n\t"
-        "vfmadb %%v2,%%v21,%%v26,%%v2\n\t"
-        "vl %%v27,80(%%r1,%[ap1])\n\t"
-        "vfmadb %%v3,%%v21,%%v27,%%v3\n\t"
-        "vl %%v28,96(%%r1,%[ap0])\n\t"
-        "vfmadb %%v4,%%v22,%%v28,%%v4\n\t"
-        "vl %%v29,96(%%r1,%[ap1])\n\t"
-        "vfmadb %%v5,%%v22,%%v29,%%v5\n\t"
-        "vl %%v30,112(%%r1,%[ap0])\n\t"
-        "vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
-        "vl %%v31,112(%%r1,%[ap1])\n\t"
-        "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
-        "agfi %%r1,128\n\t"
-        "brctg %%r0,0b\n\t"
-        "1:\n\t"
-        "lghi %%r0,12\n\t"
-        "ngr %%r0,%[n]\n\t"
-        "ltgr %%r0,%%r0\n\t"
-        "jz 3f\n\t"
-        "srlg %%r0,%%r0,2\n\t"
-        "2:\n\t"
-        "vl %%v16,0(%%r1,%[x])\n\t"
-        "vl %%v17,16(%%r1,%[x])\n\t"
-        "vl %%v24,0(%%r1,%[ap0])\n\t"
-        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-        "vl %%v25,0(%%r1,%[ap1])\n\t"
-        "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
-        "vl %%v26,16(%%r1,%[ap0])\n\t"
-        "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
-        "vl %%v27,16(%%r1,%[ap1])\n\t"
-        "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
-        "agfi %%r1,32\n\t"
-        "brctg %%r0,2b\n\t"
-        "3:\n\t"
-        "vfadb %%v0,%%v0,%%v2\n\t"
-        "vfadb %%v0,%%v0,%%v4\n\t"
-        "vfadb %%v0,%%v0,%%v6\n\t"
-        "vfadb %%v1,%%v1,%%v3\n\t"
-        "vfadb %%v1,%%v1,%%v5\n\t"
-        "vfadb %%v1,%%v1,%%v7\n\t"
-        "vrepg %%v2,%%v0,1\n\t"
-        "adbr %%f0,%%f2\n\t"
-        "std %%f0,0(%[y])\n\t"
-        "vrepg %%v2,%%v1,1\n\t"
-        "adbr %%f1,%%f2\n\t"
-        "std %%f1,8(%[y])"
-        : "=m"(*(FLOAT (*)[2]) y)
-        : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
-        "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
-        "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
-        : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-        "v26", "v27", "v28", "v29", "v30", "v31");
+        "vzero %%v1\n\t"
+        "vzero %%v2\n\t"
+        "vzero %%v3\n\t"
+        "vzero %%v4\n\t"
+        "vzero %%v5\n\t"
+        "vzero %%v6\n\t"
+        "vzero %%v7\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "lghi %%r0,-16\n\t"
+        "ngr %%r0,%[n]\n\t"
+        "ltgr %%r0,%%r0\n\t"
+        "jz 1f\n\t"
+        "srlg %%r0,%%r0,4\n\t"
+        "0:\n\t"
+        "pfd 1,1024(%%r1,%[ap0])\n\t"
+        "pfd 1,1024(%%r1,%[ap1])\n\t"
+        "pfd 1,1024(%%r1,%[x])\n\t"
+        "vl %%v16,0(%%r1,%[x])\n\t"
+        "vl %%v17,16(%%r1,%[x])\n\t"
+        "vl %%v18,32(%%r1,%[x])\n\t"
+        "vl %%v19,48(%%r1,%[x])\n\t"
+        "vl %%v20,64(%%r1,%[x])\n\t"
+        "vl %%v21,80(%%r1,%[x])\n\t"
+        "vl %%v22,96(%%r1,%[x])\n\t"
+        "vl %%v23,112(%%r1,%[x])\n\t"
+        "vl %%v24,0(%%r1,%[ap0])\n\t"
+        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
+        "vl %%v25,0(%%r1,%[ap1])\n\t"
+        "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
+        "vl %%v26,16(%%r1,%[ap0])\n\t"
+        "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
+        "vl %%v27,16(%%r1,%[ap1])\n\t"
+        "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
+        "vl %%v28,32(%%r1,%[ap0])\n\t"
+        "vfmadb %%v4,%%v18,%%v28,%%v4\n\t"
+        "vl %%v29,32(%%r1,%[ap1])\n\t"
+        "vfmadb %%v5,%%v18,%%v29,%%v5\n\t"
+        "vl %%v30,48(%%r1,%[ap0])\n\t"
+        "vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
+        "vl %%v31,48(%%r1,%[ap1])\n\t"
+        "vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
+        "vl %%v24,64(%%r1,%[ap0])\n\t"
+        "vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
+        "vl %%v25,64(%%r1,%[ap1])\n\t"
+        "vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
+        "vl %%v26,80(%%r1,%[ap0])\n\t"
+        "vfmadb %%v2,%%v21,%%v26,%%v2\n\t"
+        "vl %%v27,80(%%r1,%[ap1])\n\t"
+        "vfmadb %%v3,%%v21,%%v27,%%v3\n\t"
+        "vl %%v28,96(%%r1,%[ap0])\n\t"
+        "vfmadb %%v4,%%v22,%%v28,%%v4\n\t"
+        "vl %%v29,96(%%r1,%[ap1])\n\t"
+        "vfmadb %%v5,%%v22,%%v29,%%v5\n\t"
+        "vl %%v30,112(%%r1,%[ap0])\n\t"
+        "vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
+        "vl %%v31,112(%%r1,%[ap1])\n\t"
+        "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
+        "agfi %%r1,128\n\t"
+        "brctg %%r0,0b\n\t"
+        "1:\n\t"
+        "lghi %%r0,12\n\t"
+        "ngr %%r0,%[n]\n\t"
+        "ltgr %%r0,%%r0\n\t"
+        "jz 3f\n\t"
+        "srlg %%r0,%%r0,2\n\t"
+        "2:\n\t"
+        "vl %%v16,0(%%r1,%[x])\n\t"
+        "vl %%v17,16(%%r1,%[x])\n\t"
+        "vl %%v24,0(%%r1,%[ap0])\n\t"
+        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
+        "vl %%v25,0(%%r1,%[ap1])\n\t"
+        "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
+        "vl %%v26,16(%%r1,%[ap0])\n\t"
+        "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
+        "vl %%v27,16(%%r1,%[ap1])\n\t"
+        "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
+        "agfi %%r1,32\n\t"
+        "brctg %%r0,2b\n\t"
+        "3:\n\t"
+        "vfadb %%v0,%%v0,%%v2\n\t"
+        "vfadb %%v0,%%v0,%%v4\n\t"
+        "vfadb %%v0,%%v0,%%v6\n\t"
+        "vfadb %%v1,%%v1,%%v3\n\t"
+        "vfadb %%v1,%%v1,%%v5\n\t"
+        "vfadb %%v1,%%v1,%%v7\n\t"
+        "vrepg %%v2,%%v0,1\n\t"
+        "adbr %%f0,%%f2\n\t"
+        "std %%f0,0(%[y])\n\t"
+        "vrepg %%v2,%%v1,1\n\t"
+        "adbr %%f1,%%f2\n\t"
+        "std %%f1,8(%[y])"
+        : "=m"(*(struct { FLOAT x[2]; } *) y)
+        : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
+        "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
+        "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n)
+        : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+        "v26", "v27", "v28", "v29", "v30", "v31");
 }
 
 static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
   __asm__("vzero %%v0\n\t"
-        "vzero %%v1\n\t"
-        "vzero %%v2\n\t"
-        "vzero %%v3\n\t"
-        "vzero %%v4\n\t"
-        "vzero %%v5\n\t"
-        "vzero %%v6\n\t"
-        "vzero %%v7\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "lghi %%r0,-16\n\t"
-        "ngr %%r0,%[n]\n\t"
-        "ltgr %%r0,%%r0\n\t"
-        "jz 1f\n\t"
-        "srlg %%r0,%%r0,4\n\t"
-        "0:\n\t"
-        "pfd 1,1024(%%r1,%[a0])\n\t"
-        "pfd 1,1024(%%r1,%[x])\n\t"
-        "vl %%v16,0(%%r1,%[x])\n\t"
-        "vl %%v17,16(%%r1,%[x])\n\t"
-        "vl %%v18,32(%%r1,%[x])\n\t"
-        "vl %%v19,48(%%r1,%[x])\n\t"
-        "vl %%v20,64(%%r1,%[x])\n\t"
-        "vl %%v21,80(%%r1,%[x])\n\t"
-        "vl %%v22,96(%%r1,%[x])\n\t"
-        "vl %%v23,112(%%r1,%[x])\n\t"
-        "vl %%v24,0(%%r1,%[a0])\n\t"
-        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-        "vl %%v25,16(%%r1,%[a0])\n\t"
-        "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
-        "vl %%v26,32(%%r1,%[a0])\n\t"
-        "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
-        "vl %%v27,48(%%r1,%[a0])\n\t"
-        "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
-        "vl %%v28,64(%%r1,%[a0])\n\t"
-        "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
-        "vl %%v29,80(%%r1,%[a0])\n\t"
-        "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
-        "vl %%v30,96(%%r1,%[a0])\n\t"
-        "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
-        "vl %%v31,112(%%r1,%[a0])\n\t"
-        "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
-        "agfi %%r1,128\n\t"
-        "brctg %%r0,0b\n\t"
-        "1:\n\t"
-        "lghi %%r0,12\n\t"
-        "ngr %%r0,%[n]\n\t"
-        "ltgr %%r0,%%r0\n\t"
-        "jz 3f\n\t"
-        "srlg %%r0,%%r0,2\n\t"
-        "2:\n\t"
-        "vl %%v16,0(%%r1,%[x])\n\t"
-        "vl %%v17,16(%%r1,%[x])\n\t"
-        "vl %%v24,0(%%r1,%[a0])\n\t"
-        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-        "vl %%v25,16(%%r1,%[a0])\n\t"
-        "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
-        "agfi %%r1,32\n\t"
-        "brctg %%r0,2b\n\t"
-        "3:\n\t"
-        "vfadb %%v0,%%v0,%%v1\n\t"
-        "vfadb %%v0,%%v0,%%v2\n\t"
-        "vfadb %%v0,%%v0,%%v3\n\t"
-        "vfadb %%v0,%%v0,%%v4\n\t"
-        "vfadb %%v0,%%v0,%%v5\n\t"
-        "vfadb %%v0,%%v0,%%v6\n\t"
-        "vfadb %%v0,%%v0,%%v7\n\t"
-        "vrepg %%v1,%%v0,1\n\t"
-        "adbr %%f0,%%f1\n\t"
-        "std %%f0,0(%[y])"
-        : "=m"(*(FLOAT (*)[1]) y)
-        : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
-        "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
-        : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-        "v26", "v27", "v28", "v29", "v30", "v31");
+        "vzero %%v1\n\t"
+        "vzero %%v2\n\t"
+        "vzero %%v3\n\t"
+        "vzero %%v4\n\t"
+        "vzero %%v5\n\t"
+        "vzero %%v6\n\t"
+        "vzero %%v7\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "lghi %%r0,-16\n\t"
+        "ngr %%r0,%[n]\n\t"
+        "ltgr %%r0,%%r0\n\t"
+        "jz 1f\n\t"
+        "srlg %%r0,%%r0,4\n\t"
+        "0:\n\t"
+        "pfd 1,1024(%%r1,%[a0])\n\t"
+        "pfd 1,1024(%%r1,%[x])\n\t"
+        "vl %%v16,0(%%r1,%[x])\n\t"
+        "vl %%v17,16(%%r1,%[x])\n\t"
+        "vl %%v18,32(%%r1,%[x])\n\t"
+        "vl %%v19,48(%%r1,%[x])\n\t"
+        "vl %%v20,64(%%r1,%[x])\n\t"
+        "vl %%v21,80(%%r1,%[x])\n\t"
+        "vl %%v22,96(%%r1,%[x])\n\t"
+        "vl %%v23,112(%%r1,%[x])\n\t"
+        "vl %%v24,0(%%r1,%[a0])\n\t"
+        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
+        "vl %%v25,16(%%r1,%[a0])\n\t"
+        "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
+        "vl %%v26,32(%%r1,%[a0])\n\t"
+        "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
+        "vl %%v27,48(%%r1,%[a0])\n\t"
+        "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
+        "vl %%v28,64(%%r1,%[a0])\n\t"
+        "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
+        "vl %%v29,80(%%r1,%[a0])\n\t"
+        "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
+        "vl %%v30,96(%%r1,%[a0])\n\t"
+        "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
+        "vl %%v31,112(%%r1,%[a0])\n\t"
+        "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
+        "agfi %%r1,128\n\t"
+        "brctg %%r0,0b\n\t"
+        "1:\n\t"
+        "lghi %%r0,12\n\t"
+        "ngr %%r0,%[n]\n\t"
+        "ltgr %%r0,%%r0\n\t"
+        "jz 3f\n\t"
+        "srlg %%r0,%%r0,2\n\t"
+        "2:\n\t"
+        "vl %%v16,0(%%r1,%[x])\n\t"
+        "vl %%v17,16(%%r1,%[x])\n\t"
+        "vl %%v24,0(%%r1,%[a0])\n\t"
+        "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
+        "vl %%v25,16(%%r1,%[a0])\n\t"
+        "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
+        "agfi %%r1,32\n\t"
+        "brctg %%r0,2b\n\t"
+        "3:\n\t"
+        "vfadb %%v0,%%v0,%%v1\n\t"
+        "vfadb %%v0,%%v0,%%v2\n\t"
+        "vfadb %%v0,%%v0,%%v3\n\t"
+        "vfadb %%v0,%%v0,%%v4\n\t"
+        "vfadb %%v0,%%v0,%%v5\n\t"
+        "vfadb %%v0,%%v0,%%v6\n\t"
+        "vfadb %%v0,%%v0,%%v7\n\t"
+        "vrepg %%v1,%%v0,1\n\t"
+        "adbr %%f0,%%f1\n\t"
+        "std %%f0,0(%[y])"
+        : "=m"(*(FLOAT (*)[1]) y)
+        : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
+        "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n)
+        : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+        "v26", "v27", "v28", "v29", "v30", "v31");
 }
 
 static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
@@ -369,74 +377,74 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
 
 static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
   __asm__("vlrepg %%v0,%[da]\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "lghi %%r0,-16\n\t"
-        "ngr %%r0,%[n]\n\t"
-        "ltgr %%r0,%%r0\n\t"
-        "jz 1f\n\t"
-        "srlg %%r0,%%r0,4\n\t"
-        "0:\n\t"
-        "pfd 1,1024(%%r1,%[src])\n\t"
-        "pfd 2,1024(%%r1,%[dest])\n\t"
-        "vl %%v16,0(%%r1,%[src])\n\t"
-        "vl %%v17,16(%%r1,%[src])\n\t"
-        "vl %%v18,32(%%r1,%[src])\n\t"
-        "vl %%v19,48(%%r1,%[src])\n\t"
-        "vl %%v20,64(%%r1,%[src])\n\t"
-        "vl %%v21,80(%%r1,%[src])\n\t"
-        "vl %%v22,96(%%r1,%[src])\n\t"
-        "vl %%v23,112(%%r1,%[src])\n\t"
-        "vl %%v24, 0(%%r1,%[dest])\n\t"
-        "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
-        "vst %%v24, 0(%%r1,%[dest])\n\t"
-        "vl %%v25, 16(%%r1,%[dest])\n\t"
-        "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
-        "vst %%v25, 16(%%r1,%[dest])\n\t"
-        "vl %%v26, 32(%%r1,%[dest])\n\t"
-        "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
-        "vst %%v26, 32(%%r1,%[dest])\n\t"
-        "vl %%v27, 48(%%r1,%[dest])\n\t"
-        "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
-        "vst %%v27, 48(%%r1,%[dest])\n\t"
-        "vl %%v28, 64(%%r1,%[dest])\n\t"
-        "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
-        "vst %%v28, 64(%%r1,%[dest])\n\t"
-        "vl %%v29, 80(%%r1,%[dest])\n\t"
-        "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
-        "vst %%v29, 80(%%r1,%[dest])\n\t"
-        "vl %%v30, 96(%%r1,%[dest])\n\t"
-        "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
-        "vst %%v30, 96(%%r1,%[dest])\n\t"
-        "vl %%v31, 112(%%r1,%[dest])\n\t"
-        "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
-        "vst %%v31, 112(%%r1,%[dest])\n\t"
-        "agfi %%r1,128\n\t"
-        "brctg %%r0,0b\n\t"
-        "1:\n\t"
-        "lghi %%r0,12\n\t"
-        "ngr %%r0,%[n]\n\t"
-        "ltgr %%r0,%%r0\n\t"
-        "jz 3f\n\t"
-        "srlg %%r0,%%r0,2\n\t"
-        "2:\n\t"
-        "vl %%v16,0(%%r1,%[src])\n\t"
-        "vl %%v17,16(%%r1,%[src])\n\t"
-        "vl %%v24, 0(%%r1,%[dest])\n\t"
-        "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
-        "vst %%v24, 0(%%r1,%[dest])\n\t"
-        "vl %%v25, 16(%%r1,%[dest])\n\t"
-        "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
-        "vst %%v25, 16(%%r1,%[dest])\n\t"
-        "agfi %%r1,32\n\t"
-        "brctg %%r0,2b\n\t"
-        "3:\n\t"
-        "nop"
-        : "+m"(*(FLOAT (*)[n]) dest)
-        : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src),
-        [src] "a"(src),[n] "r"(n)
-        : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
-        "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-        "v31");
+        "xgr %%r1,%%r1\n\t"
+        "lghi %%r0,-16\n\t"
+        "ngr %%r0,%[n]\n\t"
+        "ltgr %%r0,%%r0\n\t"
+        "jz 1f\n\t"
+        "srlg %%r0,%%r0,4\n\t"
+        "0:\n\t"
+        "pfd 1,1024(%%r1,%[src])\n\t"
+        "pfd 2,1024(%%r1,%[dest])\n\t"
+        "vl %%v16,0(%%r1,%[src])\n\t"
+        "vl %%v17,16(%%r1,%[src])\n\t"
+        "vl %%v18,32(%%r1,%[src])\n\t"
+        "vl %%v19,48(%%r1,%[src])\n\t"
+        "vl %%v20,64(%%r1,%[src])\n\t"
+        "vl %%v21,80(%%r1,%[src])\n\t"
+        "vl %%v22,96(%%r1,%[src])\n\t"
+        "vl %%v23,112(%%r1,%[src])\n\t"
+        "vl %%v24, 0(%%r1,%[dest])\n\t"
+        "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
+        "vst %%v24, 0(%%r1,%[dest])\n\t"
+        "vl %%v25, 16(%%r1,%[dest])\n\t"
+        "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
+        "vst %%v25, 16(%%r1,%[dest])\n\t"
+        "vl %%v26, 32(%%r1,%[dest])\n\t"
+        "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
+        "vst %%v26, 32(%%r1,%[dest])\n\t"
+        "vl %%v27, 48(%%r1,%[dest])\n\t"
+        "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
+        "vst %%v27, 48(%%r1,%[dest])\n\t"
+        "vl %%v28, 64(%%r1,%[dest])\n\t"
+        "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
+        "vst %%v28, 64(%%r1,%[dest])\n\t"
+        "vl %%v29, 80(%%r1,%[dest])\n\t"
+        "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
+        "vst %%v29, 80(%%r1,%[dest])\n\t"
+        "vl %%v30, 96(%%r1,%[dest])\n\t"
+        "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
+        "vst %%v30, 96(%%r1,%[dest])\n\t"
+        "vl %%v31, 112(%%r1,%[dest])\n\t"
+        "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
+        "vst %%v31, 112(%%r1,%[dest])\n\t"
+        "agfi %%r1,128\n\t"
+        "brctg %%r0,0b\n\t"
+        "1:\n\t"
+        "lghi %%r0,12\n\t"
+        "ngr %%r0,%[n]\n\t"
+        "ltgr %%r0,%%r0\n\t"
+        "jz 3f\n\t"
+        "srlg %%r0,%%r0,2\n\t"
+        "2:\n\t"
+        "vl %%v16,0(%%r1,%[src])\n\t"
+        "vl %%v17,16(%%r1,%[src])\n\t"
+        "vl %%v24, 0(%%r1,%[dest])\n\t"
+        "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
+        "vst %%v24, 0(%%r1,%[dest])\n\t"
+        "vl %%v25, 16(%%r1,%[dest])\n\t"
+        "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
+        "vst %%v25, 16(%%r1,%[dest])\n\t"
+        "agfi %%r1,32\n\t"
+        "brctg %%r0,2b\n\t"
+        "3:\n\t"
+        "nop"
+        : "+m"(*(struct { FLOAT x[n]; } *) dest)
+        : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src),
+        [src] "a"(src),[n] "r"(n)
+        : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
+        "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+        "v31");
 }
 
 static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c
index 65ed31f01..cdc8d5d08 100644
--- a/kernel/zarch/dmax.c
+++ b/kernel/zarch/dmax.c
@@ -31,51 +31,51 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT max;
 
   __asm__("vl %%v0,0(%[x])\n\t"
-        "srlg %[n],%[n],5\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "0:\n\t"
-        "pfd 1, 1024(%%r1,%[x])\n\t"
-        "vl %%v16,0(%%r1,%[x])\n\t"
-        "vl %%v17,16(%%r1,%[x])\n\t"
-        "vl %%v18,32(%%r1,%[x])\n\t"
-        "vl %%v19,48(%%r1,%[x])\n\t"
-        "vl %%v20,64(%%r1,%[x])\n\t"
-        "vl %%v21,80(%%r1,%[x])\n\t"
-        "vl %%v22,96(%%r1,%[x])\n\t"
-        "vl %%v23,112(%%r1,%[x])\n\t"
-        "vl %%v24,128(%%r1,%[x])\n\t"
-        "vl %%v25,144(%%r1,%[x])\n\t"
-        "vl %%v26,160(%%r1,%[x])\n\t"
-        "vl %%v27,176(%%r1,%[x])\n\t"
-        "vl %%v28,192(%%r1,%[x])\n\t"
-        "vl %%v29,208(%%r1,%[x])\n\t"
-        "vl %%v30,224(%%r1,%[x])\n\t"
-        "vl %%v31,240(%%r1,%[x])\n\t"
-        "vfmaxdb %%v16,%%v16,%%v24,0\n\t"
-        "vfmaxdb %%v17,%%v17,%%v25,0\n\t"
-        "vfmaxdb %%v18,%%v18,%%v26,0\n\t"
-        "vfmaxdb %%v19,%%v19,%%v27,0\n\t"
-        "vfmaxdb %%v20,%%v20,%%v28,0\n\t"
-        "vfmaxdb %%v21,%%v21,%%v29,0\n\t"
-        "vfmaxdb %%v22,%%v22,%%v30,0\n\t"
-        "vfmaxdb %%v23,%%v23,%%v31,0\n\t"
-        "vfmaxdb %%v16,%%v16,%%v20,0\n\t"
-        "vfmaxdb %%v17,%%v17,%%v21,0\n\t"
-        "vfmaxdb %%v18,%%v18,%%v22,0\n\t"
-        "vfmaxdb %%v19,%%v19,%%v23,0\n\t"
-        "vfmaxdb %%v16,%%v16,%%v18,0\n\t"
-        "vfmaxdb %%v17,%%v17,%%v19,0\n\t"
-        "vfmaxdb %%v16,%%v16,%%v17,0\n\t"
-        "vfmaxdb %%v0,%%v0,%%v16,0\n\t"
-        "agfi %%r1, 256\n\t"
-        "brctg %[n], 0b\n\t"
-        "vrepg %%v16,%%v0,1\n\t"
-        "wfmaxdb %%v0,%%v0,%%v16,0\n\t"
-        "ldr %[max],%%f0"
-        : [max] "=f"(max),[n] "+&r"(n)
-        : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
-        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
-        "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+        "srlg %[n],%[n],5\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "0:\n\t"
+        "pfd 1, 1024(%%r1,%[x])\n\t"
+        "vl %%v16,0(%%r1,%[x])\n\t"
+        "vl %%v17,16(%%r1,%[x])\n\t"
+        "vl %%v18,32(%%r1,%[x])\n\t"
+        "vl %%v19,48(%%r1,%[x])\n\t"
+        "vl %%v20,64(%%r1,%[x])\n\t"
+        "vl %%v21,80(%%r1,%[x])\n\t"
+        "vl %%v22,96(%%r1,%[x])\n\t"
+        "vl %%v23,112(%%r1,%[x])\n\t"
+        "vl %%v24,128(%%r1,%[x])\n\t"
+        "vl %%v25,144(%%r1,%[x])\n\t"
+        "vl %%v26,160(%%r1,%[x])\n\t"
+        "vl %%v27,176(%%r1,%[x])\n\t"
+        "vl %%v28,192(%%r1,%[x])\n\t"
+        "vl %%v29,208(%%r1,%[x])\n\t"
+        "vl %%v30,224(%%r1,%[x])\n\t"
+        "vl %%v31,240(%%r1,%[x])\n\t"
+        "vfmaxdb %%v16,%%v16,%%v24,0\n\t"
+        "vfmaxdb %%v17,%%v17,%%v25,0\n\t"
+        "vfmaxdb %%v18,%%v18,%%v26,0\n\t"
+        "vfmaxdb %%v19,%%v19,%%v27,0\n\t"
+        "vfmaxdb %%v20,%%v20,%%v28,0\n\t"
+        "vfmaxdb %%v21,%%v21,%%v29,0\n\t"
+        "vfmaxdb %%v22,%%v22,%%v30,0\n\t"
+        "vfmaxdb %%v23,%%v23,%%v31,0\n\t"
+        "vfmaxdb %%v16,%%v16,%%v20,0\n\t"
+        "vfmaxdb %%v17,%%v17,%%v21,0\n\t"
+        "vfmaxdb %%v18,%%v18,%%v22,0\n\t"
+        "vfmaxdb %%v19,%%v19,%%v23,0\n\t"
+        "vfmaxdb %%v16,%%v16,%%v18,0\n\t"
+        "vfmaxdb %%v17,%%v17,%%v19,0\n\t"
+        "vfmaxdb %%v16,%%v16,%%v17,0\n\t"
+        "vfmaxdb %%v0,%%v0,%%v16,0\n\t"
+        "agfi %%r1, 256\n\t"
+        "brctg %[n], 0b\n\t"
+        "vrepg %%v16,%%v0,1\n\t"
+        "wfmaxdb %%v0,%%v0,%%v16,0\n\t"
+        "ldr %[max],%%f0"
+        : [max] "=f"(max),[n] "+&r"(n)
+        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+        "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
 
   return max;
 }
diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c
index 87bccbe55..c4e8d91f8 100644
--- a/kernel/zarch/dmax_z13.c
+++ b/kernel/zarch/dmax_z13.c
@@ -31,68 +31,68 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT max;
 
   __asm__("vl %%v0,0(%[x])\n\t"
-        "srlg %[n],%[n],5\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "0:\n\t"
-        "pfd 1, 1024(%%r1,%[x])\n\t"
-        "vl %%v16,0(%%r1,%[x])\n\t"
-        "vl %%v17,16(%%r1,%[x])\n\t"
-        "vl %%v18,32(%%r1,%[x])\n\t"
-        "vl %%v19,48(%%r1,%[x])\n\t"
-        "vl %%v20,64(%%r1,%[x])\n\t"
-        "vl %%v21,80(%%r1,%[x])\n\t"
-        "vl %%v22,96(%%r1,%[x])\n\t"
-        "vl %%v23,112(%%r1,%[x])\n\t"
-        "vfchdb %%v24,%%v16,%%v17\n\t"
-        "vfchdb %%v25,%%v18,%%v19\n\t"
-        "vfchdb %%v26,%%v20,%%v21\n\t"
-        "vfchdb %%v27,%%v22,%%v23\n\t"
-        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
-        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
-        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
-        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
-        "vfchdb %%v28,%%v24,%%v25\n\t"
-        "vfchdb %%v29,%%v26,%%v27\n\t"
-        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
-        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
-        "vfchdb %%v30,%%v28,%%v29\n\t"
-        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
-        "vfchdb %%v31,%%v30,%%v0\n\t"
-        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
-        "vl %%v16,128(%%r1,%[x])\n\t"
-        "vl %%v17,144(%%r1,%[x])\n\t"
-        "vl %%v18,160(%%r1,%[x])\n\t"
-        "vl %%v19,176(%%r1,%[x])\n\t"
-        "vl %%v20,192(%%r1,%[x])\n\t"
-        "vl %%v21,208(%%r1,%[x])\n\t"
-        "vl %%v22,224(%%r1,%[x])\n\t"
-        "vl %%v23,240(%%r1,%[x])\n\t"
-        "vfchdb %%v24,%%v16,%%v17\n\t"
-        "vfchdb %%v25,%%v18,%%v19\n\t"
-        "vfchdb %%v26,%%v20,%%v21\n\t"
-        "vfchdb %%v27,%%v22,%%v23\n\t"
-        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
-        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
-        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
-        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
-        "vfchdb %%v28,%%v24,%%v25\n\t"
-        "vfchdb %%v29,%%v26,%%v27\n\t"
-        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
-        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
-        "vfchdb %%v30,%%v28,%%v29\n\t"
-        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
-        "vfchdb %%v31,%%v30,%%v0\n\t"
-        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
-        "agfi %%r1, 256\n\t"
-        "brctg %[n], 0b\n\t"
-        "vrepg %%v16,%%v0,1\n\t"
-        "wfchdb %%v17,%%v0,%%v16\n\t"
-        "vsel %%v0,%%v0,%%v16,%%v17\n\t"
-        "ldr %[max],%%f0"
-        : [max] "=f"(max),[n] "+&r"(n)
-        : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
-        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
-        "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+        "srlg %[n],%[n],5\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "0:\n\t"
+        "pfd 1, 1024(%%r1,%[x])\n\t"
+        "vl %%v16,0(%%r1,%[x])\n\t"
+        "vl %%v17,16(%%r1,%[x])\n\t"
+        "vl %%v18,32(%%r1,%[x])\n\t"
+        "vl %%v19,48(%%r1,%[x])\n\t"
+        "vl %%v20,64(%%r1,%[x])\n\t"
+        "vl %%v21,80(%%r1,%[x])\n\t"
+        "vl %%v22,96(%%r1,%[x])\n\t"
+        "vl %%v23,112(%%r1,%[x])\n\t"
+        "vfchdb %%v24,%%v16,%%v17\n\t"
+        "vfchdb %%v25,%%v18,%%v19\n\t"
+        "vfchdb %%v26,%%v20,%%v21\n\t"
+        "vfchdb %%v27,%%v22,%%v23\n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
+        "vfchdb %%v28,%%v24,%%v25\n\t"
+        "vfchdb %%v29,%%v26,%%v27\n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
+        "vfchdb %%v30,%%v28,%%v29\n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
+        "vfchdb %%v31,%%v30,%%v0\n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
+        "vl %%v16,128(%%r1,%[x])\n\t"
+        "vl %%v17,144(%%r1,%[x])\n\t"
+        "vl %%v18,160(%%r1,%[x])\n\t"
+        "vl %%v19,176(%%r1,%[x])\n\t"
+        "vl %%v20,192(%%r1,%[x])\n\t"
+        "vl %%v21,208(%%r1,%[x])\n\t"
+        "vl %%v22,224(%%r1,%[x])\n\t"
+        "vl %%v23,240(%%r1,%[x])\n\t"
+        "vfchdb %%v24,%%v16,%%v17\n\t"
+        "vfchdb %%v25,%%v18,%%v19\n\t"
+        "vfchdb %%v26,%%v20,%%v21\n\t"
+        "vfchdb %%v27,%%v22,%%v23\n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
+        "vfchdb %%v28,%%v24,%%v25\n\t"
+        "vfchdb %%v29,%%v26,%%v27\n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
+        "vfchdb %%v30,%%v28,%%v29\n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
+        "vfchdb %%v31,%%v30,%%v0\n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
+        "agfi %%r1, 256\n\t"
+        "brctg %[n], 0b\n\t"
+        "vrepg %%v16,%%v0,1\n\t"
+        "wfchdb %%v17,%%v0,%%v16\n\t"
+        "vsel %%v0,%%v0,%%v16,%%v17\n\t"
+        "ldr %[max],%%f0"
+        : [max] "=f"(max),[n] "+&r"(n)
+        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+        "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
 
   return max;
 }
diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c
index 518cc262c..f9b129cbd 100644
--- a/kernel/zarch/dmin.c
+++ b/kernel/zarch/dmin.c
@@ -31,51 +31,51 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT min;
 
   __asm__("vl %%v0,0(%[x])\n\t"
-        "srlg %[n],%[n],5\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "0:\n\t"
-        "pfd 1, 1024(%%r1,%[x])\n\t"
-        "vl %%v16,0(%%r1,%[x])\n\t"
-        "vl %%v17,16(%%r1,%[x])\n\t"
-        "vl %%v18,32(%%r1,%[x])\n\t"
-        "vl %%v19,48(%%r1,%[x])\n\t"
-        "vl %%v20,64(%%r1,%[x])\n\t"
-        "vl %%v21,80(%%r1,%[x])\n\t"
-        "vl %%v22,96(%%r1,%[x])\n\t"
-        "vl %%v23,112(%%r1,%[x])\n\t"
-        "vl %%v24,128(%%r1,%[x])\n\t"
-        "vl %%v25,144(%%r1,%[x])\n\t"
-        "vl %%v26,160(%%r1,%[x])\n\t"
-        "vl %%v27,176(%%r1,%[x])\n\t"
-        "vl %%v28,192(%%r1,%[x])\n\t"
-        "vl %%v29,208(%%r1,%[x])\n\t"
-        "vl %%v30,224(%%r1,%[x])\n\t"
-        "vl %%v31,240(%%r1,%[x])\n\t"
-        "vfmindb %%v16,%%v16,%%v24,0\n\t"
-        "vfmindb %%v17,%%v17,%%v25,0\n\t"
-        "vfmindb %%v18,%%v18,%%v26,0\n\t"
-        "vfmindb %%v19,%%v19,%%v27,0\n\t"
-        "vfmindb %%v20,%%v20,%%v28,0\n\t"
-        "vfmindb %%v21,%%v21,%%v29,0\n\t"
-        "vfmindb %%v22,%%v22,%%v30,0\n\t"
-        "vfmindb %%v23,%%v23,%%v31,0\n\t"
-        "vfmindb %%v16,%%v16,%%v20,0\n\t"
-        "vfmindb %%v17,%%v17,%%v21,0\n\t"
-        "vfmindb %%v18,%%v18,%%v22,0\n\t"
-        "vfmindb %%v19,%%v19,%%v23,0\n\t"
-        "vfmindb %%v16,%%v16,%%v18,0\n\t"
-        "vfmindb %%v17,%%v17,%%v19,0\n\t"
-        "vfmindb %%v16,%%v16,%%v17,0\n\t"
-        "vfmindb %%v0,%%v0,%%v16,0\n\t"
-        "agfi %%r1, 256\n\t"
-        "brctg %[n], 0b\n\t"
-        "vrepg %%v16,%%v0,1\n\t"
-        "wfmindb %%v0,%%v0,%%v16,0\n\t"
-        "ldr %[min],%%f0"
-        : [min] "=f"(min),[n] "+&r"(n)
-        : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
-        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
-        "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+        "srlg %[n],%[n],5\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "0:\n\t"
+        "pfd 1, 1024(%%r1,%[x])\n\t"
+        "vl %%v16,0(%%r1,%[x])\n\t"
+        "vl %%v17,16(%%r1,%[x])\n\t"
+        "vl %%v18,32(%%r1,%[x])\n\t"
+        "vl %%v19,48(%%r1,%[x])\n\t"
+        "vl %%v20,64(%%r1,%[x])\n\t"
+        "vl %%v21,80(%%r1,%[x])\n\t"
+        "vl %%v22,96(%%r1,%[x])\n\t"
+        "vl %%v23,112(%%r1,%[x])\n\t"
+        "vl %%v24,128(%%r1,%[x])\n\t"
+        "vl %%v25,144(%%r1,%[x])\n\t"
+        "vl %%v26,160(%%r1,%[x])\n\t"
+        "vl %%v27,176(%%r1,%[x])\n\t"
+        "vl %%v28,192(%%r1,%[x])\n\t"
+        "vl %%v29,208(%%r1,%[x])\n\t"
+        "vl %%v30,224(%%r1,%[x])\n\t"
+        "vl %%v31,240(%%r1,%[x])\n\t"
+        "vfmindb %%v16,%%v16,%%v24,0\n\t"
+        "vfmindb %%v17,%%v17,%%v25,0\n\t"
+        "vfmindb %%v18,%%v18,%%v26,0\n\t"
+        "vfmindb %%v19,%%v19,%%v27,0\n\t"
+        "vfmindb %%v20,%%v20,%%v28,0\n\t"
+        "vfmindb %%v21,%%v21,%%v29,0\n\t"
+        "vfmindb %%v22,%%v22,%%v30,0\n\t"
+        "vfmindb %%v23,%%v23,%%v31,0\n\t"
+        "vfmindb %%v16,%%v16,%%v20,0\n\t"
+        "vfmindb %%v17,%%v17,%%v21,0\n\t"
+        "vfmindb %%v18,%%v18,%%v22,0\n\t"
+        "vfmindb %%v19,%%v19,%%v23,0\n\t"
+        "vfmindb %%v16,%%v16,%%v18,0\n\t"
+        "vfmindb %%v17,%%v17,%%v19,0\n\t"
+        "vfmindb %%v16,%%v16,%%v17,0\n\t"
+        "vfmindb %%v0,%%v0,%%v16,0\n\t"
+        "agfi %%r1, 256\n\t"
+        "brctg %[n], 0b\n\t"
+        "vrepg %%v16,%%v0,1\n\t"
+        "wfmindb %%v0,%%v0,%%v16,0\n\t"
+        "ldr %[min],%%f0"
+        : [min] "=f"(min),[n] "+&r"(n)
+        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+        "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
 
   return min;
 }
diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c
index 91561992f..77f021c1d 100644
--- a/kernel/zarch/dmin_z13.c
+++ b/kernel/zarch/dmin_z13.c
@@ -31,68 +31,68 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT min;
 
   __asm__("vl %%v0,0(%[x])\n\t"
-        "srlg %[n],%[n],5\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "0:\n\t"
-        "pfd 1, 1024(%%r1,%[x])\n\t"
-        "vl %%v16,0(%%r1,%[x])\n\t"
-        "vl %%v17,16(%%r1,%[x])\n\t"
-        "vl %%v18,32(%%r1,%[x])\n\t"
-        "vl %%v19,48(%%r1,%[x])\n\t"
-        "vl %%v20,64(%%r1,%[x])\n\t"
-        "vl %%v21,80(%%r1,%[x])\n\t"
-        "vl %%v22,96(%%r1,%[x])\n\t"
-        "vl %%v23,112(%%r1,%[x])\n\t"
-        "vfchdb %%v24,%%v17,%%v16\n\t"
-        "vfchdb %%v25,%%v19,%%v18\n\t"
-        "vfchdb %%v26,%%v21,%%v20\n\t"
-        "vfchdb %%v27,%%v23,%%v22\n\t"
-        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
-        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
-        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
-        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
-        "vfchdb %%v28,%%v25,%%v24\n\t"
-        "vfchdb %%v29,%%v27,%%v26\n\t"
-        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
-        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
-        "vfchdb %%v30,%%v29,%%v28\n\t"
-        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
-        "vfchdb %%v31,%%v0,%%v30\n\t"
-        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
-        "vl %%v16,128(%%r1,%[x])\n\t"
-        "vl %%v17,144(%%r1,%[x])\n\t"
-        "vl %%v18,160(%%r1,%[x])\n\t"
-        "vl %%v19,176(%%r1,%[x])\n\t"
-        "vl %%v20,192(%%r1,%[x])\n\t"
-        "vl %%v21,208(%%r1,%[x])\n\t"
-        "vl %%v22,224(%%r1,%[x])\n\t"
-        "vl %%v23,240(%%r1,%[x])\n\t"
-        "vfchdb %%v24,%%v17,%%v16\n\t"
-        "vfchdb %%v25,%%v19,%%v18\n\t"
-        "vfchdb %%v26,%%v21,%%v20\n\t"
-        "vfchdb %%v27,%%v23,%%v22\n\t"
-        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
-        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
-        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
-        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
-        "vfchdb %%v28,%%v25,%%v24\n\t"
-        "vfchdb %%v29,%%v27,%%v26\n\t"
-        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
-        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
-        "vfchdb %%v30,%%v29,%%v28\n\t"
-        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
-        "vfchdb %%v31,%%v0,%%v30\n\t"
-        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
-        "agfi %%r1, 256\n\t"
-        "brctg %[n], 0b\n\t"
-        "vrepg %%v16,%%v0,1\n\t"
-        "wfchdb %%v17,%%v16,%%v0\n\t"
-        "vsel %%v0,%%v0,%%v16,%%v17\n\t"
-        "ldr %[min],%%f0"
-        : [min] "=f"(min),[n] "+&r"(n)
-        : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x)
-        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
-        "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+        "srlg %[n],%[n],5\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "0:\n\t"
+        "pfd 1, 1024(%%r1,%[x])\n\t"
+        "vl %%v16,0(%%r1,%[x])\n\t"
+        "vl %%v17,16(%%r1,%[x])\n\t"
+        "vl %%v18,32(%%r1,%[x])\n\t"
+        "vl %%v19,48(%%r1,%[x])\n\t"
+        "vl %%v20,64(%%r1,%[x])\n\t"
+        "vl %%v21,80(%%r1,%[x])\n\t"
+        "vl %%v22,96(%%r1,%[x])\n\t"
+        "vl %%v23,112(%%r1,%[x])\n\t"
+        "vfchdb %%v24,%%v17,%%v16\n\t"
+        "vfchdb %%v25,%%v19,%%v18\n\t"
+        "vfchdb %%v26,%%v21,%%v20\n\t"
+        "vfchdb %%v27,%%v23,%%v22\n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
+        "vfchdb %%v28,%%v25,%%v24\n\t"
+        "vfchdb %%v29,%%v27,%%v26\n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
+        "vfchdb %%v30,%%v29,%%v28\n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
+        "vfchdb %%v31,%%v0,%%v30\n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
+        "vl %%v16,128(%%r1,%[x])\n\t"
+        "vl %%v17,144(%%r1,%[x])\n\t"
+        "vl %%v18,160(%%r1,%[x])\n\t"
+        "vl %%v19,176(%%r1,%[x])\n\t"
+        "vl %%v20,192(%%r1,%[x])\n\t"
+        "vl %%v21,208(%%r1,%[x])\n\t"
+        "vl %%v22,224(%%r1,%[x])\n\t"
+        "vl %%v23,240(%%r1,%[x])\n\t"
+        "vfchdb %%v24,%%v17,%%v16\n\t"
+        "vfchdb %%v25,%%v19,%%v18\n\t"
+        "vfchdb %%v26,%%v21,%%v20\n\t"
+        "vfchdb %%v27,%%v23,%%v22\n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26\n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27\n\t"
+        "vfchdb %%v28,%%v25,%%v24\n\t"
+        "vfchdb %%v29,%%v27,%%v26\n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28\n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29\n\t"
+        "vfchdb %%v30,%%v29,%%v28\n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30\n\t"
+        "vfchdb %%v31,%%v0,%%v30\n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31\n\t"
+        "agfi %%r1, 256\n\t"
+        "brctg %[n], 0b\n\t"
+        "vrepg %%v16,%%v0,1\n\t"
+        "wfchdb %%v17,%%v16,%%v0\n\t"
+        "vsel %%v0,%%v0,%%v16,%%v17\n\t"
+        "ldr %[min],%%f0"
+        : [min] "=f"(min),[n] "+&r"(n)
+        : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+        : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+        "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
 
   return min;
 }
diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c
index 8f0197f02..11fbe15b6 100644
--- a/kernel/zarch/drot.c
+++ b/kernel/zarch/drot.c
@@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
   __asm__("vlrepg %%v0,%[c]\n\t"
-        "vlrepg %%v1,%[s]\n\t"
-        "srlg %[n],%[n],5\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "0:\n\t"
-        "pfd 2, 1024(%%r1,%[x])\n\t"
-        "pfd 2, 1024(%%r1,%[y])\n\t"
-        "vl %%v24, 0(%%r1,%[x])\n\t"
-        "vl %%v25, 16(%%r1,%[x])\n\t"
-        "vl %%v26, 32(%%r1,%[x])\n\t"
-        "vl %%v27, 48(%%r1,%[x])\n\t"
-        "vl %%v16, 0(%%r1,%[y])\n\t"
-        "vl %%v17, 16(%%r1,%[y])\n\t"
-        "vl %%v18, 32(%%r1,%[y])\n\t"
-        "vl %%v19, 48(%%r1,%[y])\n\t"
-        "vfmdb %%v28,%%v24,%%v0\n\t"
-        "vfmdb %%v29,%%v25,%%v0\n\t"
-        "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v30,%%v26,%%v0\n\t"
-        "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v31,%%v27,%%v0\n\t"
-        "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
-        /* 2nd parts */
-        "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
-        "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
-        "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
-        "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
-        "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
-        "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
-        "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
-        "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
-        "vst %%v28, 0(%%r1,%[x])\n\t"
-        "vst %%v29, 16(%%r1,%[x])\n\t"
-        "vst %%v30, 32(%%r1,%[x])\n\t"
-        "vst %%v31, 48(%%r1,%[x])\n\t"
-        "vst %%v20, 0(%%r1,%[y])\n\t"
-        "vst %%v21, 16(%%r1,%[y])\n\t"
-        "vst %%v22, 32(%%r1,%[y])\n\t"
-        "vst %%v23, 48(%%r1,%[y])\n\t"
-        "vl %%v24, 64(%%r1,%[x])\n\t"
-        "vl %%v25, 80(%%r1,%[x])\n\t"
-        "vl %%v26, 96(%%r1,%[x])\n\t"
-        "vl %%v27, 112(%%r1,%[x])\n\t"
-        "vl %%v16, 64(%%r1,%[y])\n\t"
-        "vl %%v17, 80(%%r1,%[y])\n\t"
-        "vl %%v18, 96(%%r1,%[y])\n\t"
-        "vl %%v19, 112(%%r1,%[y])\n\t"
-        "vfmdb %%v28,%%v24,%%v0\n\t"
-        "vfmdb %%v29,%%v25,%%v0\n\t"
-        "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v30,%%v26,%%v0\n\t"
-        "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v31,%%v27,%%v0\n\t"
-        "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
-        /* 2nd parts */
-        "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
-        "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
-        "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
-        "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
-        "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
-        "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
-        "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
-        "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
-        "vst %%v28, 64(%%r1,%[x])\n\t"
-        "vst %%v29, 80(%%r1,%[x])\n\t"
-        "vst %%v30, 96(%%r1,%[x])\n\t"
-        "vst %%v31, 112(%%r1,%[x])\n\t"
-        "vst %%v20, 64(%%r1,%[y])\n\t"
-        "vst %%v21, 80(%%r1,%[y])\n\t"
-        "vst %%v22, 96(%%r1,%[y])\n\t"
-        "vst %%v23, 112(%%r1,%[y])\n\t"
-        "vl %%v24, 128(%%r1,%[x])\n\t"
-        "vl %%v25, 144(%%r1,%[x])\n\t"
-        "vl %%v26, 160(%%r1,%[x])\n\t"
-        "vl %%v27, 176(%%r1,%[x])\n\t"
-        "vl %%v16, 128(%%r1,%[y])\n\t"
-        "vl %%v17, 144(%%r1,%[y])\n\t"
-        "vl %%v18, 160(%%r1,%[y])\n\t"
-        "vl %%v19, 176(%%r1,%[y])\n\t"
-        "vfmdb %%v28,%%v24,%%v0\n\t"
-        "vfmdb %%v29,%%v25,%%v0\n\t"
-        "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v30,%%v26,%%v0\n\t"
-        "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v31,%%v27,%%v0\n\t"
-        "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
-        /* 2nd parts */
-        "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
-        "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
-        "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
-        "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
-        "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
-        "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
-        "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
-        "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
-        "vst %%v28, 128(%%r1,%[x])\n\t"
-        "vst %%v29, 144(%%r1,%[x])\n\t"
-        "vst %%v30, 160(%%r1,%[x])\n\t"
-        "vst %%v31, 176(%%r1,%[x])\n\t"
-        "vst %%v20, 128(%%r1,%[y])\n\t"
-        "vst %%v21, 144(%%r1,%[y])\n\t"
-        "vst %%v22, 160(%%r1,%[y])\n\t"
-        "vst %%v23, 176(%%r1,%[y])\n\t"
-        "vl %%v24, 192(%%r1,%[x])\n\t"
-        "vl %%v25, 208(%%r1,%[x])\n\t"
-        "vl %%v26, 224(%%r1,%[x])\n\t"
-        "vl %%v27, 240(%%r1,%[x])\n\t"
-        "vl %%v16, 192(%%r1,%[y])\n\t"
-        "vl %%v17, 208(%%r1,%[y])\n\t"
-        "vl %%v18, 224(%%r1,%[y])\n\t"
-        "vl %%v19, 240(%%r1,%[y])\n\t"
-        "vfmdb %%v28,%%v24,%%v0\n\t"
-        "vfmdb %%v29,%%v25,%%v0\n\t"
-        "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v30,%%v26,%%v0\n\t"
-        "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
-        "vfmdb %%v31,%%v27,%%v0\n\t"
-        "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
-        /* 2nd parts */
-        "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
-        "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
-        "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
-        "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
-        "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
-        "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
-        "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
-        "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
-        "vst %%v28, 192(%%r1,%[x])\n\t"
-        "vst %%v29, 208(%%r1,%[x])\n\t"
-        "vst %%v30, 224(%%r1,%[x])\n\t"
-        "vst %%v31, 240(%%r1,%[x])\n\t"
-        "vst %%v20, 192(%%r1,%[y])\n\t"
-        "vst %%v21, 208(%%r1,%[y])\n\t"
-        "vst %%v22, 224(%%r1,%[y])\n\t"
-        "vst %%v23, 240(%%r1,%[y])\n\t"
-        "agfi %%r1,256\n\t"
-        "brctg %[n],0b"
-        : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
-        : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s)
-        : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
-        "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-        "v31");
+        "vlrepg %%v1,%[s]\n\t"
+        "srlg %[n],%[n],5\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "0:\n\t"
+        "pfd 2, 1024(%%r1,%[x])\n\t"
+        "pfd 2, 1024(%%r1,%[y])\n\t"
+        "vl %%v24, 0(%%r1,%[x])\n\t"
+        "vl %%v25, 16(%%r1,%[x])\n\t"
+        "vl %%v26, 32(%%r1,%[x])\n\t"
+        "vl %%v27, 48(%%r1,%[x])\n\t"
+        "vl %%v16, 0(%%r1,%[y])\n\t"
+        "vl %%v17, 16(%%r1,%[y])\n\t"
+        "vl %%v18, 32(%%r1,%[y])\n\t"
+        "vl %%v19, 48(%%r1,%[y])\n\t"
+        "vfmdb %%v28,%%v24,%%v0\n\t"
+        "vfmdb %%v29,%%v25,%%v0\n\t"
+        "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v30,%%v26,%%v0\n\t"
+        "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v31,%%v27,%%v0\n\t"
+        "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
+        /* 2nd parts */
+        "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
+        "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
+        "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
+        "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
+        "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
+        "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
+        "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
+        "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
+        "vst %%v28, 0(%%r1,%[x])\n\t"
+        "vst %%v29, 16(%%r1,%[x])\n\t"
+        "vst %%v30, 32(%%r1,%[x])\n\t"
+        "vst %%v31, 48(%%r1,%[x])\n\t"
+        "vst %%v20, 0(%%r1,%[y])\n\t"
+        "vst %%v21, 16(%%r1,%[y])\n\t"
+        "vst %%v22, 32(%%r1,%[y])\n\t"
+        "vst %%v23, 48(%%r1,%[y])\n\t"
+        "vl %%v24, 64(%%r1,%[x])\n\t"
+        "vl %%v25, 80(%%r1,%[x])\n\t"
+        "vl %%v26, 96(%%r1,%[x])\n\t"
+        "vl %%v27, 112(%%r1,%[x])\n\t"
+        "vl %%v16, 64(%%r1,%[y])\n\t"
+        "vl %%v17, 80(%%r1,%[y])\n\t"
+        "vl %%v18, 96(%%r1,%[y])\n\t"
+        "vl %%v19, 112(%%r1,%[y])\n\t"
+        "vfmdb %%v28,%%v24,%%v0\n\t"
+        "vfmdb %%v29,%%v25,%%v0\n\t"
+        "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v30,%%v26,%%v0\n\t"
+        "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v31,%%v27,%%v0\n\t"
+        "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
+        /* 2nd parts */
+        "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
+        "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
+        "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
+        "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
+        "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
+        "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
+        "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
+        "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
+        "vst %%v28, 64(%%r1,%[x])\n\t"
+        "vst %%v29, 80(%%r1,%[x])\n\t"
+        "vst %%v30, 96(%%r1,%[x])\n\t"
+        "vst %%v31, 112(%%r1,%[x])\n\t"
+        "vst %%v20, 64(%%r1,%[y])\n\t"
+        "vst %%v21, 80(%%r1,%[y])\n\t"
+        "vst %%v22, 96(%%r1,%[y])\n\t"
+        "vst %%v23, 112(%%r1,%[y])\n\t"
+        "vl %%v24, 128(%%r1,%[x])\n\t"
+        "vl %%v25, 144(%%r1,%[x])\n\t"
+        "vl %%v26, 160(%%r1,%[x])\n\t"
+        "vl %%v27, 176(%%r1,%[x])\n\t"
+        "vl %%v16, 128(%%r1,%[y])\n\t"
+        "vl %%v17, 144(%%r1,%[y])\n\t"
+        "vl %%v18, 160(%%r1,%[y])\n\t"
+        "vl %%v19, 176(%%r1,%[y])\n\t"
+        "vfmdb %%v28,%%v24,%%v0\n\t"
+        "vfmdb %%v29,%%v25,%%v0\n\t"
+        "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v30,%%v26,%%v0\n\t"
+        "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v31,%%v27,%%v0\n\t"
+        "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
+        /* 2nd parts */
+        "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
+        "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
+        "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
+        "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
+        "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
+        "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
+        "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
+        "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
+        "vst %%v28, 128(%%r1,%[x])\n\t"
+        "vst %%v29, 144(%%r1,%[x])\n\t"
+        "vst %%v30, 160(%%r1,%[x])\n\t"
+        "vst %%v31, 176(%%r1,%[x])\n\t"
+        "vst %%v20, 128(%%r1,%[y])\n\t"
+        "vst %%v21, 144(%%r1,%[y])\n\t"
+        "vst %%v22, 160(%%r1,%[y])\n\t"
+        "vst %%v23, 176(%%r1,%[y])\n\t"
+        "vl %%v24, 192(%%r1,%[x])\n\t"
+        "vl %%v25, 208(%%r1,%[x])\n\t"
+        "vl %%v26, 224(%%r1,%[x])\n\t"
+        "vl %%v27, 240(%%r1,%[x])\n\t"
+        "vl %%v16, 192(%%r1,%[y])\n\t"
+        "vl %%v17, 208(%%r1,%[y])\n\t"
+        "vl %%v18, 224(%%r1,%[y])\n\t"
+        "vl %%v19, 240(%%r1,%[y])\n\t"
+        "vfmdb %%v28,%%v24,%%v0\n\t"
+        "vfmdb %%v29,%%v25,%%v0\n\t"
+        "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v30,%%v26,%%v0\n\t"
+        "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
+        "vfmdb %%v31,%%v27,%%v0\n\t"
+        "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
+        /* 2nd parts */
+        "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
+        "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
+        "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
+        "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
+        "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
+        "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
+        "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
+        "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
+        "vst %%v28, 192(%%r1,%[x])\n\t"
+        "vst %%v29, 208(%%r1,%[x])\n\t"
+        "vst %%v30, 224(%%r1,%[x])\n\t"
+        "vst %%v31, 240(%%r1,%[x])\n\t"
+        "vst %%v20, 192(%%r1,%[y])\n\t"
+        "vst %%v21, 208(%%r1,%[y])\n\t"
+        "vst %%v22, 224(%%r1,%[y])\n\t"
+        "vst %%v23, 240(%%r1,%[y])\n\t"
+        "agfi %%r1,256\n\t"
+        "brctg %[n],0b"
+        : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
+        [n] "+&r"(n)
+        : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
+        : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
+        "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+        "v31");
 }
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c
index c944990b5..2961eff20 100644
--- a/kernel/zarch/dscal.c
+++ b/kernel/zarch/dscal.c
@@ -29,61 +29,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) {
   __asm__("vlrepg %%v0,%[da]\n\t"
-        "srlg %[n],%[n],4\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "0:\n\t"
-        "pfd 2, 1024(%%r1,%[x])\n\t"
-        "vl %%v24,0(%%r1,%[x])\n\t"
-        "vfmdb %%v24,%%v24,%%v0\n\t"
-        "vst %%v24,0(%%r1,%[x])\n\t"
-        "vl %%v25,16(%%r1,%[x])\n\t"
-        "vfmdb %%v25,%%v25,%%v0\n\t"
-        "vst %%v25,16(%%r1,%[x])\n\t"
-        "vl %%v26,32(%%r1,%[x])\n\t"
-        "vfmdb %%v26,%%v26,%%v0\n\t"
-        "vst %%v26,32(%%r1,%[x])\n\t"
-        "vl %%v27,48(%%r1,%[x])\n\t"
-        "vfmdb %%v27,%%v27,%%v0\n\t"
-        "vst %%v27,48(%%r1,%[x])\n\t"
-        "vl %%v28,64(%%r1,%[x])\n\t"
-        "vfmdb %%v28,%%v28,%%v0\n\t"
-        "vst %%v28,64(%%r1,%[x])\n\t"
-        "vl %%v29,80(%%r1,%[x])\n\t"
-        "vfmdb %%v29,%%v29,%%v0\n\t"
-        "vst %%v29,80(%%r1,%[x])\n\t"
-        "vl %%v30,96(%%r1,%[x])\n\t"
-        "vfmdb %%v30,%%v30,%%v0\n\t"
-        "vst %%v30,96(%%r1,%[x])\n\t"
-        "vl %%v31,112(%%r1,%[x])\n\t"
-        "vfmdb %%v31,%%v31,%%v0\n\t"
-        "vst %%v31,112(%%r1,%[x])\n\t"
-        "agfi %%r1,128\n\t"
-        "brctg %[n],0b"
-        : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
-        : [x] "a"(x),[da] "m"(da)
-        : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-        "v31");
+        "srlg %[n],%[n],4\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "0:\n\t"
+        "pfd 2, 1024(%%r1,%[x])\n\t"
+        "vl %%v24,0(%%r1,%[x])\n\t"
+        "vfmdb %%v24,%%v24,%%v0\n\t"
+        "vst %%v24,0(%%r1,%[x])\n\t"
+        "vl %%v25,16(%%r1,%[x])\n\t"
+        "vfmdb %%v25,%%v25,%%v0\n\t"
+        "vst %%v25,16(%%r1,%[x])\n\t"
+        "vl %%v26,32(%%r1,%[x])\n\t"
+        "vfmdb %%v26,%%v26,%%v0\n\t"
+        "vst %%v26,32(%%r1,%[x])\n\t"
+        "vl %%v27,48(%%r1,%[x])\n\t"
+        "vfmdb %%v27,%%v27,%%v0\n\t"
+        "vst %%v27,48(%%r1,%[x])\n\t"
+        "vl %%v28,64(%%r1,%[x])\n\t"
+        "vfmdb %%v28,%%v28,%%v0\n\t"
+        "vst %%v28,64(%%r1,%[x])\n\t"
+        "vl %%v29,80(%%r1,%[x])\n\t"
+        "vfmdb %%v29,%%v29,%%v0\n\t"
+        "vst %%v29,80(%%r1,%[x])\n\t"
+        "vl %%v30,96(%%r1,%[x])\n\t"
+        "vfmdb %%v30,%%v30,%%v0\n\t"
+        "vst %%v30,96(%%r1,%[x])\n\t"
+        "vl %%v31,112(%%r1,%[x])\n\t"
+        "vfmdb %%v31,%%v31,%%v0\n\t"
+        "vst %%v31,112(%%r1,%[x])\n\t"
+        "agfi %%r1,128\n\t"
+        "brctg %[n],0b"
+        : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
+        : [x] "a"(x),[da] "Q"(da)
+        : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+        "v31");
 }
 
 static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
   __asm__("vzero %%v0\n\t"
-        "srlg %[n],%[n],4\n\t"
-        "xgr %%r1,%%r1\n\t"
-        "0:\n\t"
-        "pfd 2, 1024(%%r1,%[x])\n\t"
-        "vst %%v0,0(%%r1,%[x])\n\t"
-        "vst %%v0,16(%%r1,%[x])\n\t"
-        "vst %%v0,32(%%r1,%[x])\n\t"
-        "vst %%v0,48(%%r1,%[x])\n\t"
-        "vst %%v0,64(%%r1,%[x])\n\t"
-        "vst %%v0,80(%%r1,%[x])\n\t"
-        "vst %%v0,96(%%r1,%[x])\n\t"
-        "vst %%v0,112(%%r1,%[x])\n\t"
-        "agfi %%r1,128\n\t"
-        "brctg %[n],0b"
-        : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
-        : [x] "a"(x)
-        : "cc", "r1", "v0");
+        "srlg %[n],%[n],4\n\t"
+        "xgr %%r1,%%r1\n\t"
+        "0:\n\t"
+        "pfd 2, 1024(%%r1,%[x])\n\t"
+        "vst %%v0,0(%%r1,%[x])\n\t"
+        "vst %%v0,16(%%r1,%[x])\n\t"
+        "vst %%v0,32(%%r1,%[x])\n\t"
+        "vst %%v0,48(%%r1,%[x])\n\t"
+        "vst %%v0,64(%%r1,%[x])\n\t"
+        "vst %%v0,80(%%r1,%[x])\n\t"
+        "vst %%v0,96(%%r1,%[x])\n\t"
+        "vst %%v0,112(%%r1,%[x])\n\t"
+        "agfi %%r1,128\n\t"
+        "brctg %[n],0b"
+        : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
+ : [x] "a"(x) + : "cc", "r1", "v0"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 1ac02d4b9..5fa88c3b9 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -31,91 +31,92 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { double dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vlef %%v16,0(%%r1,%[x]),0\n\t" - "vlef %%v16,4(%%r1,%[x]),2\n\t" - "vlef %%v17,8(%%r1,%[x]),0\n\t" - "vlef %%v17,12(%%r1,%[x]),2\n\t" - "vlef %%v18,16(%%r1,%[x]),0\n\t" - "vlef %%v18,20(%%r1,%[x]),2\n\t" - "vlef %%v19,24(%%r1,%[x]),0\n\t" - "vlef %%v19,28(%%r1,%[x]),2\n\t" - "vlef %%v20,32(%%r1,%[x]),0\n\t" - "vlef %%v20,36(%%r1,%[x]),2\n\t" - "vlef %%v21,40(%%r1,%[x]),0\n\t" - "vlef %%v21,44(%%r1,%[x]),2\n\t" - "vlef %%v22,48(%%r1,%[x]),0\n\t" - "vlef %%v22,52(%%r1,%[x]),2\n\t" - "vlef %%v23,56(%%r1,%[x]),0\n\t" - "vlef %%v23,60(%%r1,%[x]),2\n\t" - "vflls %%v16,%%v16\n\t" - "vflls %%v17,%%v17\n\t" - "vflls %%v18,%%v18\n\t" - "vflls %%v19,%%v19\n\t" - "vflls %%v20,%%v20\n\t" - "vflls %%v21,%%v21\n\t" - "vflls %%v22,%%v22\n\t" - "vflls %%v23,%%v23\n\t" - "vlef %%v24,0(%%r1,%[y]),0\n\t" - "vlef %%v24,4(%%r1,%[y]),2\n\t" - "vflls %%v24,%%v24\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vlef %%v25,8(%%r1,%[y]),0\n\t" - "vlef %%v25,12(%%r1,%[y]),2\n\t" - "vflls %%v25,%%v25\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vlef %%v26,16(%%r1,%[y]),0\n\t" - "vlef %%v26,20(%%r1,%[y]),2\n\t" - "vflls %%v26,%%v26\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vlef %%v27,24(%%r1,%[y]),0\n\t" - "vlef %%v27,28(%%r1,%[y]),2\n\t" - "vflls %%v27,%%v27\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vlef %%v28,32(%%r1,%[y]),0\n\t" - "vlef %%v28,36(%%r1,%[y]),2\n\t" - "vflls %%v28,%%v28\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vlef %%v29,40(%%r1,%[y]),0\n\t" - "vlef %%v29,44(%%r1,%[y]),2\n\t" - "vflls %%v29,%%v29\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vlef %%v30,48(%%r1,%[y]),0\n\t" - "vlef %%v30,52(%%r1,%[y]),2\n\t" - "vflls %%v30,%%v30\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vlef %%v31,56(%%r1,%[y]),0\n\t" - "vlef %%v31,60(%%r1,%[y]),2\n\t" - "vflls %%v31,%%v31\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,64\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "ldr %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vlef %%v16,0(%%r1,%[x]),0\n\t" + "vlef %%v16,4(%%r1,%[x]),2\n\t" + "vlef %%v17,8(%%r1,%[x]),0\n\t" + "vlef %%v17,12(%%r1,%[x]),2\n\t" + "vlef %%v18,16(%%r1,%[x]),0\n\t" + "vlef %%v18,20(%%r1,%[x]),2\n\t" + "vlef 
%%v19,24(%%r1,%[x]),0\n\t" + "vlef %%v19,28(%%r1,%[x]),2\n\t" + "vlef %%v20,32(%%r1,%[x]),0\n\t" + "vlef %%v20,36(%%r1,%[x]),2\n\t" + "vlef %%v21,40(%%r1,%[x]),0\n\t" + "vlef %%v21,44(%%r1,%[x]),2\n\t" + "vlef %%v22,48(%%r1,%[x]),0\n\t" + "vlef %%v22,52(%%r1,%[x]),2\n\t" + "vlef %%v23,56(%%r1,%[x]),0\n\t" + "vlef %%v23,60(%%r1,%[x]),2\n\t" + "vflls %%v16,%%v16\n\t" + "vflls %%v17,%%v17\n\t" + "vflls %%v18,%%v18\n\t" + "vflls %%v19,%%v19\n\t" + "vflls %%v20,%%v20\n\t" + "vflls %%v21,%%v21\n\t" + "vflls %%v22,%%v22\n\t" + "vflls %%v23,%%v23\n\t" + "vlef %%v24,0(%%r1,%[y]),0\n\t" + "vlef %%v24,4(%%r1,%[y]),2\n\t" + "vflls %%v24,%%v24\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vlef %%v25,8(%%r1,%[y]),0\n\t" + "vlef %%v25,12(%%r1,%[y]),2\n\t" + "vflls %%v25,%%v25\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vlef %%v26,16(%%r1,%[y]),0\n\t" + "vlef %%v26,20(%%r1,%[y]),2\n\t" + "vflls %%v26,%%v26\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vlef %%v27,24(%%r1,%[y]),0\n\t" + "vlef %%v27,28(%%r1,%[y]),2\n\t" + "vflls %%v27,%%v27\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vlef %%v28,32(%%r1,%[y]),0\n\t" + "vlef %%v28,36(%%r1,%[y]),2\n\t" + "vflls %%v28,%%v28\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vlef %%v29,40(%%r1,%[y]),0\n\t" + "vlef %%v29,44(%%r1,%[y]),2\n\t" + "vflls %%v29,%%v29\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vlef %%v30,48(%%r1,%[y]),0\n\t" + "vlef %%v30,52(%%r1,%[y]),2\n\t" + "vflls %%v30,%%v30\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vlef %%v31,56(%%r1,%[y]),0\n\t" + "vlef %%v31,60(%%r1,%[y]),2\n\t" + "vflls %%v31,%%v31\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 60ba40bd6..f0c9ded51 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 
48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 1e1040a6e..a2546b812 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -34,191 +34,191 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v1,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v1,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v1,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v1,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v1,%%v1\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,16\n\t" - "vzero %%v4\n\t" - "vleib %%v9,0,0\n\t" - "vleib %%v9,1,1\n\t" - "vleib %%v9,2,2\n\t" - "vleib %%v9,3,3\n\t" - "vleib %%v9,8,4\n\t" - "vleib %%v9,9,5\n\t" - "vleib %%v9,10,6\n\t" - "vleib %%v9,11,7\n\t" - "vleib %%v9,16,8\n\t" - "vleib %%v9,17,9\n\t" - "vleib %%v9,18,10\n\t" - "vleib %%v9,19,11\n\t" - "vleib %%v9,24,12\n\t" - "vleib %%v9,25,13\n\t" - "vleib %%v9,26,14\n\t" - "vleib %%v9,27,15\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v28,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v29,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v30,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v28,144(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v29,176(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v30,208(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, 
%%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index d1c0e32a1..09654b742 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -34,191 +34,191 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v1,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v1,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v1,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v1,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v1,%%v1\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,16\n\t" - "vzero %%v4\n\t" - "vleib %%v9,0,0\n\t" - "vleib %%v9,1,1\n\t" - "vleib %%v9,2,2\n\t" - "vleib %%v9,3,3\n\t" - "vleib %%v9,8,4\n\t" - "vleib %%v9,9,5\n\t" - "vleib %%v9,10,6\n\t" - "vleib %%v9,11,7\n\t" - "vleib %%v9,16,8\n\t" - "vleib %%v9,17,9\n\t" - "vleib %%v9,18,10\n\t" - "vleib %%v9,19,11\n\t" - "vleib %%v9,24,12\n\t" - "vleib %%v9,25,13\n\t" - "vleib %%v9,26,14\n\t" - "vleib %%v9,27,15\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - 
"srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v28,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v29,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v30,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v28,144(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v29,176(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v30,208(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" 
- "jne 1f\n\t" - "vstef %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm 
%%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 8434c811f..b292c1d15 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -34,138 +34,138 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl 
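For orientation amid the vpkg/vperm shuffling: the complex kernels (icamax, icamin) deinterleave the (re, im) pairs, take lane-wise absolute values with vflpsb, and add them with vfasb, because the BLAS "absolute value" used by the camax-style reductions is |re| + |im|, not the Euclidean modulus. A scalar reference for what icamax_kernel_32 computes (illustrative only, 0-based index as in the kernel):

    #include <math.h>
    #include <stddef.h>

    /* x holds n complex elements as 2*n interleaved floats. Strict '>' keeps
       the earliest index on ties, matching the asm's vfchedb/vmnlg choices. */
    static size_t icamax_ref(size_t n, const float *x) {
        size_t imax = 0;
        float amax = fabsf(x[0]) + fabsf(x[1]);
        for (size_t i = 1; i < n; i++) {
            float a = fabsf(x[2 * i]) + fabsf(x[2 * i + 1]);
            if (a > amax) { amax = a; imax = i; }
        }
        return imax;
    }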
%%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig 
%%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + 
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 80a37e6c2..f9a8119e1 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -34,138 +34,138 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb 
%%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel 
%%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 18cdba437..8f283bc17 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -31,121 +31,121 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl 
%%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[max],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[max]\n\t" - "vlgvg %[imax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl 
%%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imax; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 02ca427e4..e4b7bb4fe 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -31,121 +31,121 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - 
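The scalar tails of the double-precision kernels (vrepg, wfcdb, vmnlg, and wfchdb on the not-equal branch) fold the two 64-bit halves of the running (value, index) pair into the final answer, keeping the smaller index when the values compare equal so that the first occurrence wins. The same step in plain C, for the max variants (illustrative only, NaN handling ignored):

    #include <stddef.h>

    /* Reduce two (value, index) candidates the way the asm epilogue does:
       equal values keep the smaller index (vmnlg), otherwise the larger
       value wins (wfchdb + vsel). */
    static size_t fold_max(double v0, size_t i0, double v1, size_t i1,
                           double *value_out) {
        if (v0 == v1) { *value_out = v0; return i0 < i1 ? i0 : i1; }
        if (v1 > v0)  { *value_out = v1; return i1; }
        *value_out = v0;
        return i0;
    }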
"vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[min],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[min]\n\t" - "vlgvg %[imin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag 
%%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imin; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index bbb4012aa..ac86435d7 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -34,182 +34,182 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpsb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v23,112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel 
%%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb 
%%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index e8b34b934..3f2d039eb 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -34,182 +34,182 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpsb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl 
%%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" 
- "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + 
"vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index a565df503..41172c1bd 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -31,165 +31,165 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v23,112(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[max],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[max]\n\t" - "vlgvg %[imax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + 
"vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag 
%%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imax; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index ff72b2c64..e2684df41 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -31,165 +31,165 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg 
%%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[min],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[min]\n\t" - "vlgvg %[imin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl 
%%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", 
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imin; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 48afb8215..daca1d6f7 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -34,134 +34,134 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v1,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v1,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v1,%%v1\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,8\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel 
%%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + 
"vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); return iamax; } diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 3edbe3d58..9ababb91f 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -34,134 +34,134 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v1,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v1,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v1,%%v1\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,8\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg 
%%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel 
%%v16,%%v16,%%v17,%%v4\n\t"
+ "vsel %%v4,%%v24,%%v25,%%v4\n\t"
+ "vsel %%v17,%%v18,%%v19,%%v5\n\t"
+ "vsel %%v5,%%v26,%%v27,%%v5\n\t"
+ "vfchedb %%v18,%%v17,%%v16\n\t"
+ "vsel %%v16,%%v16,%%v17,%%v18\n\t"
+ "vsel %%v4,%%v4,%%v5,%%v18\n\t"
+ "vag %%v4,%%v4,%%v3\n\t"
+ "vfchedb %%v5,%%v16,%%v0\n\t"
+ "vsel %%v0,%%v0,%%v16,%%v5\n\t"
+ "vsel %%v1,%%v1,%%v4,%%v5\n\t"
+ "vag %%v3,%%v3,%%v2\n\t"
+ "vleg %%v16,128(%%r1,%[x]),0\n\t"
+ "vleg %%v17,136(%%r1,%[x]),0\n\t"
+ "vleg %%v16,144(%%r1,%[x]),1\n\t"
+ "vleg %%v17,152(%%r1,%[x]),1\n\t"
+ "vleg %%v18,160(%%r1,%[x]),0\n\t"
+ "vleg %%v19,168(%%r1,%[x]),0\n\t"
+ "vleg %%v18,176(%%r1,%[x]),1\n\t"
+ "vleg %%v19,184(%%r1,%[x]),1\n\t"
+ "vleg %%v20,192(%%r1,%[x]),0\n\t"
+ "vleg %%v21,200(%%r1,%[x]),0\n\t"
+ "vleg %%v20,208(%%r1,%[x]),1\n\t"
+ "vleg %%v21,216(%%r1,%[x]),1\n\t"
+ "vleg %%v22,224(%%r1,%[x]),0\n\t"
+ "vleg %%v23,232(%%r1,%[x]),0\n\t"
+ "vleg %%v22,240(%%r1,%[x]),1\n\t"
+ "vleg %%v23,248(%%r1,%[x]),1\n\t"
+ "vflpdb %%v16, %%v16\n\t"
+ "vflpdb %%v17, %%v17\n\t"
+ "vflpdb %%v18, %%v18\n\t"
+ "vflpdb %%v19, %%v19\n\t"
+ "vflpdb %%v20, %%v20\n\t"
+ "vflpdb %%v21, %%v21\n\t"
+ "vflpdb %%v22, %%v22\n\t"
+ "vflpdb %%v23, %%v23\n\t"
+ "vfadb %%v16,%%v16,%%v17\n\t"
+ "vfadb %%v17,%%v18,%%v19\n\t"
+ "vfadb %%v18,%%v20,%%v21\n\t"
+ "vfadb %%v19,%%v22,%%v23\n\t"
+ "vfchedb %%v4,%%v17,%%v16\n\t"
+ "vfchedb %%v5,%%v19,%%v18\n\t"
+ "vsel %%v16,%%v16,%%v17,%%v4\n\t"
+ "vsel %%v4,%%v24,%%v25,%%v4\n\t"
+ "vsel %%v17,%%v18,%%v19,%%v5\n\t"
+ "vsel %%v5,%%v26,%%v27,%%v5\n\t"
+ "vfchedb %%v18,%%v17,%%v16\n\t"
+ "vsel %%v16,%%v16,%%v17,%%v18\n\t"
+ "vsel %%v4,%%v4,%%v5,%%v18\n\t"
+ "vag %%v4,%%v4,%%v3\n\t"
+ "vfchedb %%v5,%%v16,%%v0\n\t"
+ "vsel %%v0,%%v0,%%v16,%%v5\n\t"
+ "vsel %%v1,%%v1,%%v4,%%v5\n\t"
+ "vag %%v3,%%v3,%%v2\n\t"
+ "agfi %%r1, 256\n\t"
+ "brctg %[n], 0b\n\t"
+ "vrepg %%v2,%%v0,1\n\t"
+ "vrepg %%v3,%%v1,1\n\t"
+ "wfcdb %%v2,%%v0\n\t"
+ "jne 1f\n\t"
+ "vsteg %%v0,%[amin],0\n\t"
+ "vmnlg %%v0,%%v1,%%v3\n\t"
+ "vlgvg %[iamin],%%v0,0\n\t"
+ "j 2f\n\t"
+ "1:\n\t"
+ "wfchdb %%v4,%%v0,%%v2\n\t"
+ "vsel %%v1,%%v3,%%v1,%%v4\n\t"
+ "vsel %%v0,%%v2,%%v0,%%v4\n\t"
+ "std %%f0,%[amin]\n\t"
+ "vlgvg %[iamin],%%v1,0\n\t"
+ "2:\n\t"
+ "nop"
+ : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
+ : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
+ : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
 return iamin;
 }
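The izamax/izamin hunks above only re-indent the assembly and tighten the operand constraints; the algorithm itself is unchanged. For reference, a minimal scalar sketch of what izamin_kernel_16 computes (assuming FLOAT is double here, and using |Re| + |Im| as the complex magnitude, which is what the vflpdb/vfadb pairs implement; the vector kernel's exact tie-breaking is ignored):

    #include <math.h>

    /* Scalar sketch of izamin_kernel_16: 0-based index of the complex
       element with the smallest |Re| + |Im| (the kernel requires
       n % 16 == 0; this reference loop does not). */
    static long izamin_ref(long n, const double *x, double *amin) {
      long i, imin = 0;
      double v, best = fabs(x[0]) + fabs(x[1]);
      for (i = 1; i < n; i++) {
        v = fabs(x[2 * i]) + fabs(x[2 * i + 1]); /* |Re| + |Im| */
        if (v < best) { best = v; imin = i; }
      }
      *amin = best;
      return imin;
    }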
"vfmaxsb %%v22,%%v22,%%v30,8\n\t" - "vfmaxsb %%v23,%%v23,%%v31,8\n\t" - "vfmaxsb %%v16,%%v16,%%v20,8\n\t" - "vfmaxsb %%v17,%%v17,%%v21,8\n\t" - "vfmaxsb %%v18,%%v18,%%v22,8\n\t" - "vfmaxsb %%v19,%%v19,%%v23,8\n\t" - "vfmaxsb %%v16,%%v16,%%v18,8\n\t" - "vfmaxsb %%v17,%%v17,%%v19,8\n\t" - "vfmaxsb %%v16,%%v16,%%v17,8\n\t" - "vfmaxsb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,8\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,8\n\t" - "lper %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,8\n\t" + "vfmaxsb %%v17,%%v17,%%v25,8\n\t" + "vfmaxsb %%v18,%%v18,%%v26,8\n\t" + "vfmaxsb %%v19,%%v19,%%v27,8\n\t" + "vfmaxsb %%v20,%%v20,%%v28,8\n\t" + "vfmaxsb %%v21,%%v21,%%v29,8\n\t" + "vfmaxsb %%v22,%%v22,%%v30,8\n\t" + "vfmaxsb %%v23,%%v23,%%v31,8\n\t" + "vfmaxsb %%v16,%%v16,%%v20,8\n\t" + "vfmaxsb %%v17,%%v17,%%v21,8\n\t" + "vfmaxsb %%v18,%%v18,%%v22,8\n\t" + "vfmaxsb %%v19,%%v19,%%v23,8\n\t" + "vfmaxsb %%v16,%%v16,%%v18,8\n\t" + "vfmaxsb %%v17,%%v17,%%v19,8\n\t" + "vfmaxsb %%v16,%%v16,%%v17,8\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 138836ce5..f05e851f9 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -34,53 +34,53 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfminsb %%v16,%%v16,%%v24,8\n\t" - "vfminsb %%v17,%%v17,%%v25,8\n\t" - "vfminsb %%v18,%%v18,%%v26,8\n\t" - "vfminsb %%v19,%%v19,%%v27,8\n\t" - "vfminsb %%v20,%%v20,%%v28,8\n\t" - "vfminsb %%v21,%%v21,%%v29,8\n\t" - "vfminsb %%v22,%%v22,%%v30,8\n\t" - "vfminsb %%v23,%%v23,%%v31,8\n\t" - "vfminsb %%v16,%%v16,%%v20,8\n\t" - "vfminsb 
%%v17,%%v17,%%v21,8\n\t" - "vfminsb %%v18,%%v18,%%v22,8\n\t" - "vfminsb %%v19,%%v19,%%v23,8\n\t" - "vfminsb %%v16,%%v16,%%v18,8\n\t" - "vfminsb %%v17,%%v17,%%v19,8\n\t" - "vfminsb %%v16,%%v16,%%v17,8\n\t" - "vfminsb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,8\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,8\n\t" - "lper %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,8\n\t" + "vfminsb %%v17,%%v17,%%v25,8\n\t" + "vfminsb %%v18,%%v18,%%v26,8\n\t" + "vfminsb %%v19,%%v19,%%v27,8\n\t" + "vfminsb %%v20,%%v20,%%v28,8\n\t" + "vfminsb %%v21,%%v21,%%v29,8\n\t" + "vfminsb %%v22,%%v22,%%v30,8\n\t" + "vfminsb %%v23,%%v23,%%v31,8\n\t" + "vfminsb %%v16,%%v16,%%v20,8\n\t" + "vfminsb %%v17,%%v17,%%v21,8\n\t" + "vfminsb %%v18,%%v18,%%v22,8\n\t" + "vfminsb %%v19,%%v19,%%v23,8\n\t" + "vfminsb %%v16,%%v16,%%v18,8\n\t" + "vfminsb %%v17,%%v17,%%v19,8\n\t" + "vfminsb %%v16,%%v16,%%v17,8\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index 0c3057a92..d56f2697b 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -34,83 +34,83 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 
160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", 
"v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index e41e87af0..ca34a47ff 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -29,82 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepf %%v0,%[alpha]\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl %%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" - "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), - [alpha] "m"(*alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl 
%%v29,80(%%r1,%[y])\n\t"
+ "vl %%v30,96(%%r1,%[y])\n\t"
+ "vl %%v31,112(%%r1,%[y])\n\t"
+ "vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
+ "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
+ "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
+ "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
+ "vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
+ "vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
+ "vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
+ "vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
+ "vst %%v16,0(%%r1,%[y])\n\t"
+ "vst %%v17,16(%%r1,%[y])\n\t"
+ "vst %%v18,32(%%r1,%[y])\n\t"
+ "vst %%v19,48(%%r1,%[y])\n\t"
+ "vst %%v24,64(%%r1,%[y])\n\t"
+ "vst %%v25,80(%%r1,%[y])\n\t"
+ "vst %%v26,96(%%r1,%[y])\n\t"
+ "vst %%v27,112(%%r1,%[y])\n\t"
+ "vl %%v16,128(%%r1,%[x])\n\t"
+ "vl %%v17,144(%%r1,%[x])\n\t"
+ "vl %%v18,160(%%r1,%[x])\n\t"
+ "vl %%v19,176(%%r1,%[x])\n\t"
+ "vl %%v20,128(%%r1,%[y])\n\t"
+ "vl %%v21,144(%%r1,%[y])\n\t"
+ "vl %%v22,160(%%r1,%[y])\n\t"
+ "vl %%v23,176(%%r1,%[y])\n\t"
+ "vl %%v24,192(%%r1,%[x])\n\t"
+ "vl %%v25,208(%%r1,%[x])\n\t"
+ "vl %%v26,224(%%r1,%[x])\n\t"
+ "vl %%v27,240(%%r1,%[x])\n\t"
+ "vl %%v28,192(%%r1,%[y])\n\t"
+ "vl %%v29,208(%%r1,%[y])\n\t"
+ "vl %%v30,224(%%r1,%[y])\n\t"
+ "vl %%v31,240(%%r1,%[y])\n\t"
+ "vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
+ "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
+ "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
+ "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
+ "vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
+ "vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
+ "vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
+ "vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
+ "vst %%v16,128(%%r1,%[y])\n\t"
+ "vst %%v17,144(%%r1,%[y])\n\t"
+ "vst %%v18,160(%%r1,%[y])\n\t"
+ "vst %%v19,176(%%r1,%[y])\n\t"
+ "vst %%v24,192(%%r1,%[y])\n\t"
+ "vst %%v25,208(%%r1,%[y])\n\t"
+ "vst %%v26,224(%%r1,%[y])\n\t"
+ "vst %%v27,240(%%r1,%[y])\n\t"
+ "agfi %%r1,256\n\t"
+ "brctg %[n],0b"
+ : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
+ : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
+ [alpha] "Q"(*alpha)
+ : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+ "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
 }

 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
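The constraint rewrite visible in this saxpy hunk is the pattern repeated throughout the series: scalar operands move from "m" to "Q" (a single base-plus-displacement memory operand, which instructions such as ste, std and vlrepf require), and whole vectors are described to the compiler with a variable-length struct cast instead of the older array-pointer cast, so GCC knows exactly which n elements the asm reads or writes and will not cache them across the statement. A minimal sketch of the idiom outside OpenBLAS (s390x GCC assumed; the function and its names are illustrative only, not from the patch):

    /* Illustrative only: sums x[0] and x[1] into s, assuming n >= 2.
       "Q" yields a D(B) memory operand for the std; the struct cast
       marks all n doubles behind x as inputs to the asm. */
    static double sum2(long n, const double *x) {
      double s;
      __asm__("ld   %%f0,0(%[x])\n\t"
              "adb  %%f0,8(%[x])\n\t"
              "std  %%f0,%[s]"
              : [s] "=Q"(s)
              : "m"(*(const struct { double d[n]; } *) x),[x] "a"(x)
              : "cc", "f0");
      return s;
    }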
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],6\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index f659b0c8a..d870b30f0 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -31,64 +31,64 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { FLOAT dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" - "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v3\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v5\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v0,%%v0,%%v7\n\t" - "vrepf %%v1,%%v0,1\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepf %%v3,%%v0,3\n\t" - "aebr %%f0,%%f1\n\t" - "aebr %%f0,%%f2\n\t" - "aebr %%f0,%%f3\n\t" - "ler %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), - [y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + 
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "vrepf %%v1,%%v0,1\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepf %%v3,%%v0,3\n\t" + "aebr %%f0,%%f1\n\t" + "aebr %%f0,%%f2\n\t" + "aebr %%f0,%%f3\n\t" + "ler %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 86ac24993..a1efef373 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -31,304 +31,314 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v1,4(%[x])\n\t" - "vlrepf %%v2,8(%[x])\n\t" - "vlrepf %%v3,12(%[x])\n\t" - "vlrepf %%v4,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v4\n\t" - "vfmsb %%v1,%%v1,%%v4\n\t" - "vfmsb %%v2,%%v2,%%v4\n\t" - "vfmsb %%v3,%%v3,%%v4\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl 
%%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,8(%[x])\n\t" + "vlrepf %%v3,12(%[x])\n\t" + "vlrepf %%v4,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v4\n\t" + "vfmsb %%v1,%%v1,%%v4\n\t" + "vfmsb %%v2,%%v2,%%v4\n\t" + "vfmsb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb 
%%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v1,4(%[x])\n\t" - "vlrepf %%v2,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v2\n\t" - "vfmsb %%v1,%%v1,%%v2\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 
1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" - "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" - "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" - "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" - "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v2\n\t" + "vfmsb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + 
"vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v16,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v16\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,0(%%r1,%[y])\n\t" - "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" - "vst %%v17,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg 
%%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v16,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,0(%%r1,%[y])\n\t" + "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" + "vst %%v17,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 6ae9b6d7f..81d7c9fe7 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -30,330 +30,338 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define NBMAX 2048 static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmasb 
%%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v1,%%v1,%%v5\n\t" - "vfasb %%v2,%%v2,%%v6\n\t" - "vfasb %%v3,%%v3,%%v7\n\t" - "veslg %%v4,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vrepg %%v4,%%v0,1\n\t" - "aebr %%f0,%%f4\n\t" - "ste %%f0,0(%[y])\n\t" - "veslg %%v4,%%v1,32\n\t" - "vfasb %%v1,%%v1,%%v4\n\t" - "vrepg %%v4,%%v1,1\n\t" - "aebr %%f1,%%f4\n\t" - "ste %%f1,4(%[y])\n\t" - "veslg %%v4,%%v2,32\n\t" - "vfasb %%v2,%%v2,%%v4\n\t" - "vrepg %%v4,%%v2,1\n\t" - "aebr %%f2,%%f4\n\t" - "ste %%f2,8(%[y])\n\t" - "veslg %%v4,%%v3,32\n\t" - "vfasb %%v3,%%v3,%%v4\n\t" - "vrepg %%v4,%%v3,1\n\t" - "aebr %%f3,%%f4\n\t" - "ste %%f3,12(%[y])" - : "=m"(*(FLOAT (*)[4]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmasb 
%%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v2,%%v2,%%v6\n\t" + "vfasb %%v3,%%v3,%%v7\n\t" + "veslg %%v4,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vrepg %%v4,%%v0,1\n\t" + "aebr %%f0,%%f4\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v4,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v4\n\t" + "vrepg %%v4,%%v1,1\n\t" + "aebr %%f1,%%f4\n\t" + "ste %%f1,4(%[y])\n\t" + "veslg %%v4,%%v2,32\n\t" + "vfasb %%v2,%%v2,%%v4\n\t" + "vrepg %%v4,%%v2,1\n\t" + "aebr %%f2,%%f4\n\t" + "ste %%f2,8(%[y])\n\t" + "veslg %%v4,%%v3,32\n\t" + "vfasb %%v3,%%v3,%%v4\n\t" + "vrepg %%v4,%%v3,1\n\t" + "aebr %%f3,%%f4\n\t" + "ste %%f3,12(%[y])" + : "=m"(*(struct { FLOAT x[4]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" - "vfmasb 
%%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" - "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" - "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v1,%%v1,%%v3\n\t" - "vfasb %%v1,%%v1,%%v5\n\t" - "vfasb %%v1,%%v1,%%v7\n\t" - "veslg %%v2,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vrepg %%v2,%%v0,1\n\t" - "aebr %%f0,%%f2\n\t" - "ste %%f0,0(%[y])\n\t" - "veslg %%v2,%%v1,32\n\t" - "vfasb %%v1,%%v1,%%v2\n\t" - "vrepg %%v2,%%v1,1\n\t" - "aebr %%f1,%%f2\n\t" - "ste %%f1,4(%[y])" - : "=m"(*(FLOAT (*)[2]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" + "vl 
%%v30,112(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v1,%%v1,%%v3\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v1,%%v1,%%v7\n\t" + "veslg %%v2,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vrepg %%v2,%%v0,1\n\t" + "aebr %%f0,%%f2\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v2,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v2\n\t" + "vrepg %%v2,%%v1,1\n\t" + "aebr %%f1,%%f2\n\t" + "ste %%f1,4(%[y])" + : "=m"(*(struct { FLOAT x[2]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" - "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" - "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" - "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v3\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v5\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v0,%%v0,%%v7\n\t" - "veslg %%v1,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vrepg %%v1,%%v0,1\n\t" - "aebr %%f0,%%f1\n\t" - "ste %%f0,0(%[y])" - : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 
"v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "veslg %%v1,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vrepg %%v1,%%v0,1\n\t" + "aebr %%f0,%%f1\n\t" + "ste %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { @@ -366,70 +374,70 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { __asm__("vlrepf %%v0,%[da]\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl %%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" - "vfmasb 
%%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) dest) - : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), - [src] "a"(src),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index df3c9cb4d..7015aaa1d 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -31,53 +31,53 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) { FLOAT max; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - 
"vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxsb %%v16,%%v16,%%v24,0\n\t" - "vfmaxsb %%v17,%%v17,%%v25,0\n\t" - "vfmaxsb %%v18,%%v18,%%v26,0\n\t" - "vfmaxsb %%v19,%%v19,%%v27,0\n\t" - "vfmaxsb %%v20,%%v20,%%v28,0\n\t" - "vfmaxsb %%v21,%%v21,%%v29,0\n\t" - "vfmaxsb %%v22,%%v22,%%v30,0\n\t" - "vfmaxsb %%v23,%%v23,%%v31,0\n\t" - "vfmaxsb %%v16,%%v16,%%v20,0\n\t" - "vfmaxsb %%v17,%%v17,%%v21,0\n\t" - "vfmaxsb %%v18,%%v18,%%v22,0\n\t" - "vfmaxsb %%v19,%%v19,%%v23,0\n\t" - "vfmaxsb %%v16,%%v16,%%v18,0\n\t" - "vfmaxsb %%v17,%%v17,%%v19,0\n\t" - "vfmaxsb %%v16,%%v16,%%v17,0\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,0\n\t" - "ler %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v17,%%v17,%%v25,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v19,%%v19,%%v27,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v21,%%v21,%%v29,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v23,%%v23,%%v31,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v17,%%v17,%%v21,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v19,%%v19,%%v23,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v17,%%v17,%%v19,0\n\t" + "vfmaxsb %%v16,%%v16,%%v17,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return max; } diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index 2e9c793c4..b6875c5c6 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -31,53 +31,53 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { FLOAT min; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl 
%%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfminsb %%v16,%%v16,%%v24,0\n\t" - "vfminsb %%v17,%%v17,%%v25,0\n\t" - "vfminsb %%v18,%%v18,%%v26,0\n\t" - "vfminsb %%v19,%%v19,%%v27,0\n\t" - "vfminsb %%v20,%%v20,%%v28,0\n\t" - "vfminsb %%v21,%%v21,%%v29,0\n\t" - "vfminsb %%v22,%%v22,%%v30,0\n\t" - "vfminsb %%v23,%%v23,%%v31,0\n\t" - "vfminsb %%v16,%%v16,%%v20,0\n\t" - "vfminsb %%v17,%%v17,%%v21,0\n\t" - "vfminsb %%v18,%%v18,%%v22,0\n\t" - "vfminsb %%v19,%%v19,%%v23,0\n\t" - "vfminsb %%v16,%%v16,%%v18,0\n\t" - "vfminsb %%v17,%%v17,%%v19,0\n\t" - "vfminsb %%v16,%%v16,%%v17,0\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,0\n\t" - "ler %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v17,%%v17,%%v25,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v19,%%v19,%%v27,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v21,%%v21,%%v29,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v23,%%v23,%%v31,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v17,%%v17,%%v21,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v19,%%v19,%%v23,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v17,%%v17,%%v19,0\n\t" + "vfminsb %%v16,%%v16,%%v17,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return min; } diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 5b21a19dc..4f471d866 100644 --- a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
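[Reader's note, not part of the patch: for reference while reading the srot hunk below, the kernel vectorizes the standard BLAS Givens rotation, and its /* yn=x*s */ and /* yn=y*c-yn */ comments trace the two-step evaluation of the y update. With c = *c and s = *s replicated across the vector lanes, each element is transformed as in this scalar sketch:

  FLOAT xn = c * x[i] + s * y[i];  /* vfmsb (x*c), then vfmasb (+ y*s)   */
  FLOAT yn = c * y[i] - s * x[i];  /* vfmsb (x*s), then vfmssb (y*c - .) */
  x[i] = xn;
  y[i] = yn;
]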
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepf %%v0,%[c]\n\t" - "vlrepf %%v1,%[s]\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb 
%%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb 
%%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", 
"v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index 07e6845c6..9b9930dc8 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -29,61 +29,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { __asm__("vlrepf %%v0,%[da]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" - "vfmsb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" - "vfmsb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" - "vfmsb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" - "vfmsb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" - "vfmsb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" - "vfmsb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" - "vfmsb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vfmsb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x),[da] "m"(da) - : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmsb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmsb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmsb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmsb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmsb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmsb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmsb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmsb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x),[da] "Q"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) 
+ : [x] "a"(x) + : "cc", "r1", "v0"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index dc7113143..0c62f189d 100644 --- a/kernel/zarch/sswap.c +++ b/kernel/zarch/sswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + 
"vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 531e47a0b..aa04ab91f 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -34,89 +34,89 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vleg %%v24,128(%%r1,%[x]),0\n\t" - "vleg %%v25,136(%%r1,%[x]),0\n\t" - "vleg %%v24,144(%%r1,%[x]),1\n\t" - "vleg %%v25,152(%%r1,%[x]),1\n\t" - "vleg %%v26,160(%%r1,%[x]),0\n\t" - "vleg %%v27,168(%%r1,%[x]),0\n\t" - "vleg %%v26,176(%%r1,%[x]),1\n\t" - "vleg %%v27,184(%%r1,%[x]),1\n\t" - "vleg %%v28,192(%%r1,%[x]),0\n\t" - "vleg %%v29,200(%%r1,%[x]),0\n\t" - "vleg %%v28,208(%%r1,%[x]),1\n\t" - "vleg %%v29,216(%%r1,%[x]),1\n\t" - "vleg %%v30,224(%%r1,%[x]),0\n\t" - "vleg 
%%v31,232(%%r1,%[x]),0\n\t" - "vleg %%v30,240(%%r1,%[x]),1\n\t" - "vleg %%v31,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16,%%v16\n\t" - "vflpdb %%v17,%%v17\n\t" - "vflpdb %%v18,%%v18\n\t" - "vflpdb %%v19,%%v19\n\t" - "vflpdb %%v20,%%v20\n\t" - "vflpdb %%v21,%%v21\n\t" - "vflpdb %%v22,%%v22\n\t" - "vflpdb %%v23,%%v23\n\t" - "vflpdb %%v24,%%v24\n\t" - "vflpdb %%v25,%%v25\n\t" - "vflpdb %%v26,%%v26\n\t" - "vflpdb %%v27,%%v27\n\t" - "vflpdb %%v28,%%v28\n\t" - "vflpdb %%v29,%%v29\n\t" - "vflpdb %%v30,%%v30\n\t" - "vflpdb %%v31,%%v31\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v18,%%v18,%%v19\n\t" - "vfadb %%v20,%%v20,%%v21\n\t" - "vfadb %%v22,%%v22,%%v23\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v26,%%v26,%%v27\n\t" - "vfadb %%v28,%%v28,%%v29\n\t" - "vfadb %%v30,%%v30,%%v31\n\t" - "vfmaxdb %%v16,%%v16,%%v24,0\n\t" - "vfmaxdb %%v18,%%v18,%%v26,0\n\t" - "vfmaxdb %%v20,%%v20,%%v28,0\n\t" - "vfmaxdb %%v22,%%v22,%%v30,0\n\t" - "vfmaxdb %%v16,%%v16,%%v20,0\n\t" - "vfmaxdb %%v18,%%v18,%%v22,0\n\t" - "vfmaxdb %%v16,%%v16,%%v18,0\n\t" - "vfmaxdb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,0\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb 
%%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index cac2da938..37278d6db 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -34,98 +34,98 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v24,%%v25\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v26,%%v0\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v24,%%v25\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v26,%%v0\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - 
"agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); return amax; } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 940d81dd2..0b5402853 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -34,89 +34,89 @@ static 
FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vleg %%v24,128(%%r1,%[x]),0\n\t" - "vleg %%v25,136(%%r1,%[x]),0\n\t" - "vleg %%v24,144(%%r1,%[x]),1\n\t" - "vleg %%v25,152(%%r1,%[x]),1\n\t" - "vleg %%v26,160(%%r1,%[x]),0\n\t" - "vleg %%v27,168(%%r1,%[x]),0\n\t" - "vleg %%v26,176(%%r1,%[x]),1\n\t" - "vleg %%v27,184(%%r1,%[x]),1\n\t" - "vleg %%v28,192(%%r1,%[x]),0\n\t" - "vleg %%v29,200(%%r1,%[x]),0\n\t" - "vleg %%v28,208(%%r1,%[x]),1\n\t" - "vleg %%v29,216(%%r1,%[x]),1\n\t" - "vleg %%v30,224(%%r1,%[x]),0\n\t" - "vleg %%v31,232(%%r1,%[x]),0\n\t" - "vleg %%v30,240(%%r1,%[x]),1\n\t" - "vleg %%v31,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16,%%v16\n\t" - "vflpdb %%v17,%%v17\n\t" - "vflpdb %%v18,%%v18\n\t" - "vflpdb %%v19,%%v19\n\t" - "vflpdb %%v20,%%v20\n\t" - "vflpdb %%v21,%%v21\n\t" - "vflpdb %%v22,%%v22\n\t" - "vflpdb %%v23,%%v23\n\t" - "vflpdb %%v24,%%v24\n\t" - "vflpdb %%v25,%%v25\n\t" - "vflpdb %%v26,%%v26\n\t" - "vflpdb %%v27,%%v27\n\t" - "vflpdb %%v28,%%v28\n\t" - "vflpdb %%v29,%%v29\n\t" - "vflpdb %%v30,%%v30\n\t" - "vflpdb %%v31,%%v31\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v18,%%v18,%%v19\n\t" - "vfadb %%v20,%%v20,%%v21\n\t" - "vfadb %%v22,%%v22,%%v23\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v26,%%v26,%%v27\n\t" - "vfadb %%v28,%%v28,%%v29\n\t" - "vfadb %%v30,%%v30,%%v31\n\t" - "vfmindb %%v16,%%v16,%%v24,0\n\t" - "vfmindb %%v18,%%v18,%%v26,0\n\t" - "vfmindb %%v20,%%v20,%%v28,0\n\t" - "vfmindb %%v22,%%v22,%%v30,0\n\t" - "vfmindb %%v16,%%v16,%%v20,0\n\t" - "vfmindb %%v18,%%v18,%%v22,0\n\t" - "vfmindb %%v16,%%v16,%%v18,0\n\t" - "vfmindb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,0\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg 
%%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c index 7417e0b74..e37bb2236 100644 --- a/kernel/zarch/zamin_z13.c +++ b/kernel/zarch/zamin_z13.c @@ -34,98 +34,98 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vsel 
%%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v25,%%v24\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v0,%%v26\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v25,%%v24\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v0,%%v26\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg 
%%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); return amin; } diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 43ae8ff8b..aeef8d77e 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -34,81 +34,81 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg 
%%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 31549849d..9363ec32d 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -30,77 +30,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__( #if !defined(CONJ) - "vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" + "vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" #else - "vleg %%v0,0(%[alpha]),1\n\t" - "vflcdb %%v0,%%v0\n\t" - "vleg %%v0,0(%[alpha]),0\n\t" - "vlrepg %%v1,8(%[alpha])\n\t" + "vleg %%v0,0(%[alpha]),1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,0(%[alpha]),0\n\t" + "vlrepg %%v1,8(%[alpha])\n\t" #endif - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" - "vpdi %%v24,%%v8,%%v8,4\n\t" - "vpdi %%v25,%%v9,%%v9,4\n\t" - "vpdi %%v26,%%v10,%%v10,4\n\t" - "vpdi %%v27,%%v11,%%v11,4\n\t" - "vpdi %%v28,%%v16,%%v16,4\n\t" - "vpdi %%v29,%%v17,%%v17,4\n\t" - "vpdi %%v30,%%v18,%%v18,4\n\t" - "vpdi %%v31,%%v19,%%v19,4\n\t" - "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" - "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" - "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" - "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" - "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" - "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" - "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" - "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" - "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" - "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" - "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" - "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" - "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vpdi %%v24,%%v8,%%v8,4\n\t" + "vpdi %%v25,%%v9,%%v9,4\n\t" + "vpdi %%v26,%%v10,%%v10,4\n\t" + "vpdi %%v27,%%v11,%%v11,4\n\t" + "vpdi %%v28,%%v16,%%v16,4\n\t" + "vpdi %%v29,%%v17,%%v17,4\n\t" + "vpdi 
%%v30,%%v18,%%v18,4\n\t" + "vpdi %%v31,%%v19,%%v19,4\n\t" + "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 50ff18646..5a46aec1c 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -29,16 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],4\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + [n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 7a67ef734..ac6e69c23 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -29,76 +29,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v25,%%v25,%%v27\n\t" - "vfadb %%v25,%%v25,%%v29\n\t" - "vfadb %%v25,%%v25,%%v31\n\t" - "vsteg %%v24,0(%[d]),0\n\t" - "vsteg %%v24,8(%[d]),1\n\t" - "vsteg %%v25,16(%[d]),1\n\t" - "vsteg %%v25,24(%[d]),0" - : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 
96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v25,%%v25,%%v27\n\t" + "vfadb %%v25,%%v25,%%v29\n\t" + "vfadb %%v25,%%v25,%%v31\n\t" + "vsteg %%v24,0(%[d]),0\n\t" + "vsteg %%v24,8(%[d]),1\n\t" + "vsteg %%v25,16(%[d]),1\n\t" + "vsteg %%v25,24(%[d]),0" + : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 7f21985ec..5ca8da3c1 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,235 +30,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" - "vl %%v18,32(%[x])\n\t" - "vl %%v19,48(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v20,8(%[x]),0\n\t" - "wflcdb %%v20,%%v20\n\t" - "vleg %%v20,0(%[x]),1\n\t" - "vleg %%v21,24(%[x]),0\n\t" - "wflcdb %%v21,%%v21\n\t" - "vleg %%v21,16(%[x]),1\n\t" - "vleg %%v22,40(%[x]),0\n\t" - "wflcdb %%v22,%%v22\n\t" - "vleg %%v22,32(%[x]),1\n\t" - "vleg %%v23,56(%[x]),0\n\t" - "wflcdb %%v23,%%v23\n\t" - "vleg %%v23,48(%[x]),1\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "wflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "wflcdb %%v22,%%v22\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vleg %%v23,56(%[x]),0\n\t" + "wflcdb %%v23,%%v23\n\t" + "vleg %%v23,48(%[x]),1\n\t" #else - "vleg %%v20,0(%[x]),1\n\t" - "vflcdb %%v20,%%v20\n\t" - "vleg %%v20,8(%[x]),0\n\t" - "vleg %%v21,16(%[x]),1\n\t" - "vflcdb %%v21,%%v21\n\t" - "vleg %%v21,24(%[x]),0\n\t" - "vleg %%v22,32(%[x]),1\n\t" - "vflcdb %%v22,%%v22\n\t" - "vleg %%v22,40(%[x]),0\n\t" - "vleg %%v23,48(%[x]),1\n\t" - "vflcdb %%v23,%%v23\n\t" - "vleg %%v23,56(%[x]),0\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vflcdb %%v21,%%v21\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vflcdb %%v22,%%v22\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "vleg %%v23,48(%[x]),1\n\t" + "vflcdb %%v23,%%v23\n\t" + "vleg %%v23,56(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 
1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v24,0(%%r1,%[ap0])\n\t" - "vlrepg %%v25,8(%%r1,%[ap0])\n\t" - "vlrepg %%v26,0(%%r1,%[ap1])\n\t" - "vlrepg %%v27,8(%%r1,%[ap1])\n\t" - "vlrepg %%v28,16(%%r1,%[ap0])\n\t" - "vlrepg %%v29,24(%%r1,%[ap0])\n\t" - "vlrepg %%v30,16(%%r1,%[ap1])\n\t" - "vlrepg %%v31,24(%%r1,%[ap1])\n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" - "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" - "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" - "vlrepg %%v24,0(%%r1,%[ap2])\n\t" - "vlrepg %%v25,8(%%r1,%[ap2])\n\t" - "vlrepg %%v26,0(%%r1,%[ap3])\n\t" - "vlrepg %%v27,8(%%r1,%[ap3])\n\t" - "vlrepg %%v28,16(%%r1,%[ap2])\n\t" - "vlrepg %%v29,24(%%r1,%[ap2])\n\t" - "vlrepg %%v30,16(%%r1,%[ap3])\n\t" - "vlrepg %%v31,24(%%r1,%[ap3])\n\t" - "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" - "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" - "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" - "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" - "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" - "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" - "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" - "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap0])\n\t" + "vlrepg %%v29,24(%%r1,%[ap0])\n\t" + "vlrepg %%v30,16(%%r1,%[ap1])\n\t" + "vlrepg %%v31,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" + "vlrepg %%v24,0(%%r1,%[ap2])\n\t" + "vlrepg %%v25,8(%%r1,%[ap2])\n\t" + "vlrepg %%v26,0(%%r1,%[ap3])\n\t" + "vlrepg %%v27,8(%%r1,%[ap3])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 
2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v18,8(%[x]),0\n\t" - "wflcdb %%v18,%%v18\n\t" - "vleg %%v18,0(%[x]),1\n\t" - "vleg %%v19,24(%[x]),0\n\t" - "wflcdb %%v19,%%v19\n\t" - "vleg %%v19,16(%[x]),1\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "wflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vleg %%v19,24(%[x]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,16(%[x]),1\n\t" #else - "vleg %%v18,0(%[x]),1\n\t" - "vflcdb %%v18,%%v18\n\t" - "vleg %%v18,8(%[x]),0\n\t" - "vleg %%v19,16(%[x]),1\n\t" - "vflcdb %%v19,%%v19\n\t" - "vleg %%v19,24(%[x]),0\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "vleg %%v19,16(%[x]),1\n\t" + "vflcdb %%v19,%%v19\n\t" + "vleg %%v19,24(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v20,0(%%r1,%[ap0])\n\t" - "vlrepg %%v21,8(%%r1,%[ap0])\n\t" - "vlrepg %%v22,0(%%r1,%[ap1])\n\t" - "vlrepg %%v23,8(%%r1,%[ap1])\n\t" - "vlrepg %%v24,16(%%r1,%[ap0])\n\t" - "vlrepg %%v25,24(%%r1,%[ap0])\n\t" - "vlrepg %%v26,16(%%r1,%[ap1])\n\t" - "vlrepg %%v27,24(%%r1,%[ap1])\n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" - "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" - "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg 
%[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27"); } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v17,8(%[x]),0\n\t" - "wflcdb %%v17,%%v17\n\t" - "vleg %%v17,0(%[x]),1\n\t" + "vleg %%v17,8(%[x]),0\n\t" + "wflcdb %%v17,%%v17\n\t" + "vleg %%v17,0(%[x]),1\n\t" #else - "vleg %%v17,0(%[x]),1\n\t" - "vflcdb %%v17,%%v17\n\t" - "vleg %%v17,8(%[x]),0\n\t" + "vleg %%v17,0(%[x]),1\n\t" + "vflcdb %%v17,%%v17\n\t" + "vleg %%v17,8(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v18,0(%%r1,%[ap])\n\t" - "vlrepg %%v19,8(%%r1,%[ap])\n\t" - "vlrepg %%v20,16(%%r1,%[ap])\n\t" - "vlrepg %%v21,24(%%r1,%[ap])\n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vlrepg %%v20,16(%%r1,%[ap])\n\t" + "vlrepg %%v21,24(%%r1,%[ap])\n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) { __asm__( #if !defined(XCONJ) - "vlrepg %%v0,%[alpha_r]\n\t" - "vleg %%v1,%[alpha_i],0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,%[alpha_i],1\n\t" + "vlrepg %%v0,%[alpha_r]\n\t" + "vleg %%v1,%[alpha_i],0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,%[alpha_i],1\n\t" #else - "vleg %%v0,%[alpha_r],1\n\t" - "vflcdb %%v0,%%v0\n\t" - "vleg %%v0,%[alpha_r],0\n\t" - "vlrepg %%v1,%[alpha_i]\n\t" + "vleg %%v0,%[alpha_r],1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,%[alpha_r],0\n\t" + "vlrepg %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],2\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,0(%%r1,%[dest])\n\t" - "vl %%v21,16(%%r1,%[dest])\n\t" - "vl %%v22,32(%%r1,%[dest])\n\t" - "vl %%v23,48(%%r1,%[dest])\n\t" - "vpdi 
%%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" - "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest])\n\t" - "vst %%v29,16(%%r1,%[dest])\n\t" - "vst %%v30,32(%%r1,%[dest])\n\t" - "vst %%v31,48(%%r1,%[dest])\n\t" - "agfi %%r1,64\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), - [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl %%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 7b3e6c1fc..031c31e29 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -31,266 +31,274 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "vzero %%v20\n\t" - "vzero %%v21\n\t" - "vzero %%v22\n\t" - "vzero %%v23\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "vzero %%v18\n\t" + "vzero %%v19\n\t" + "vzero %%v20\n\t" + "vzero %%v21\n\t" + "vzero %%v22\n\t" + "vzero %%v23\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v24,0(%%r1,%[ap0])\n\t" - "vlrepg %%v25,8(%%r1,%[ap0])\n\t" - "vlrepg %%v26,0(%%r1,%[ap1])\n\t" - "vlrepg %%v27,8(%%r1,%[ap1])\n\t" - "vlrepg %%v28,0(%%r1,%[ap2])\n\t" - "vlrepg %%v29,8(%%r1,%[ap2])\n\t" - "vlrepg %%v30,0(%%r1,%[ap3])\n\t" - "vlrepg %%v31,8(%%r1,%[ap3])\n\t" - "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" - "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,0(%%r1,%[ap2])\n\t" + "vlrepg %%v29,8(%%r1,%[ap2])\n\t" + "vlrepg %%v30,0(%%r1,%[ap3])\n\t" + "vlrepg %%v31,8(%%r1,%[ap3])\n\t" + "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" + "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" + "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" + "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" + "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" + "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" + "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v24,16(%%r1,%[ap0])\n\t" - "vlrepg %%v25,24(%%r1,%[ap0])\n\t" - "vlrepg %%v26,16(%%r1,%[ap1])\n\t" - "vlrepg %%v27,24(%%r1,%[ap1])\n\t" - "vlrepg %%v28,16(%%r1,%[ap2])\n\t" - "vlrepg %%v29,24(%%r1,%[ap2])\n\t" - "vlrepg %%v30,16(%%r1,%[ap3])\n\t" - "vlrepg %%v31,24(%%r1,%[ap3])\n\t" - "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" - 
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v20\n\t" - "vfadb %%v17,%%v17,%%v21\n\t" - "vfadb %%v18,%%v18,%%v22\n\t" - "vfadb %%v19,%%v19,%%v23\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" + "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" + "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" + "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" + "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" + "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" + "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v20\n\t" + "vfadb %%v17,%%v17,%%v21\n\t" + "vfadb %%v18,%%v18,%%v22\n\t" + "vfadb %%v19,%%v19,%%v23\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" #if !defined(XCONJ) - "vlrepg %%v24,0(%[alpha])\n\t" - "vleg %%v25,8(%[alpha]),0\n\t" - "wflcdb %%v25,%%v25\n\t" - "vleg %%v25,8(%[alpha]),1\n\t" + "vlrepg %%v24,0(%[alpha])\n\t" + "vleg %%v25,8(%[alpha]),0\n\t" + "wflcdb %%v25,%%v25\n\t" + "vleg %%v25,8(%[alpha]),1\n\t" #else - "vleg %%v24,0(%[alpha]),1\n\t" - "vflcdb %%v24,%%v24\n\t" - "vleg %%v24,0(%[alpha]),0\n\t" - "vlrepg %%v25,8(%[alpha])\n\t" + "vleg %%v24,0(%[alpha]),1\n\t" + "vflcdb %%v24,%%v24\n\t" + "vleg %%v24,0(%[alpha]),0\n\t" + "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y])\n\t" - "vl %%v27,16(%[y])\n\t" - "vl %%v28,32(%[y])\n\t" - "vl %%v29,48(%[y])\n\t" - "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" - "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" - "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" - "vfmadb %%v27,%%v21,%%v25,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v24,%%v28\n\t" - "vfmadb %%v28,%%v22,%%v25,%%v28\n\t" - "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" - "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y])\n\t" - "vst %%v27,16(%[y])\n\t" - "vst %%v28,32(%[y])\n\t" - "vst %%v29,48(%[y])" - : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vl %%v26,0(%[y])\n\t" + "vl %%v27,16(%[y])\n\t" + "vl %%v28,32(%[y])\n\t" + "vl %%v29,48(%[y])\n\t" + "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" + "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" + "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" + "vfmadb %%v27,%%v21,%%v25,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v24,%%v28\n\t" + "vfmadb %%v28,%%v22,%%v25,%%v28\n\t" + "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" + "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" + "vst %%v26,0(%[y])\n\t" + "vst %%v27,16(%[y])\n\t" + "vst %%v28,32(%[y])\n\t" + "vst %%v29,48(%[y])" + : "+m"(*(struct { 
FLOAT x[8]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "vzero %%v18\n\t" + "vzero %%v19\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v20,0(%%r1,%[ap0])\n\t" - "vlrepg %%v21,8(%%r1,%[ap0])\n\t" - "vlrepg %%v22,0(%%r1,%[ap1])\n\t" - "vlrepg %%v23,8(%%r1,%[ap1])\n\t" - "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" + "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" + "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" + "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v20,16(%%r1,%[ap0])\n\t" - "vlrepg %%v21,24(%%r1,%[ap0])\n\t" - "vlrepg %%v22,16(%%r1,%[ap1])\n\t" - "vlrepg %%v23,24(%%r1,%[ap1])\n\t" - "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v18\n\t" - "vfadb %%v17,%%v17,%%v19\n\t" - "vpdi %%v18,%%v16,%%v16,4\n\t" - "vpdi %%v19,%%v17,%%v17,4\n\t" + "vlrepg %%v20,16(%%r1,%[ap0])\n\t" + "vlrepg %%v21,24(%%r1,%[ap0])\n\t" + "vlrepg %%v22,16(%%r1,%[ap1])\n\t" + "vlrepg %%v23,24(%%r1,%[ap1])\n\t" + "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" + "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" + "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" + 
"vfmadb %%v19,%%v23,%%v1,%%v19\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v18\n\t" + "vfadb %%v17,%%v17,%%v19\n\t" + "vpdi %%v18,%%v16,%%v16,4\n\t" + "vpdi %%v19,%%v17,%%v17,4\n\t" #if !defined(XCONJ) - "vlrepg %%v20,0(%[alpha])\n\t" - "vleg %%v21,8(%[alpha]),0\n\t" - "wflcdb %%v21,%%v21\n\t" - "vleg %%v21,8(%[alpha]),1\n\t" + "vlrepg %%v20,0(%[alpha])\n\t" + "vleg %%v21,8(%[alpha]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,8(%[alpha]),1\n\t" #else - "vleg %%v20,0(%[alpha]),1\n\t" - "vflcdb %%v20,%%v20\n\t" - "vleg %%v20,0(%[alpha]),0\n\t" - "vlrepg %%v21,8(%[alpha])\n\t" + "vleg %%v20,0(%[alpha]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[alpha]),0\n\t" + "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y])\n\t" - "vl %%v23,16(%[y])\n\t" - "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" - "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" - "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" - "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y])\n\t" - "vst %%v23,16(%[y])\n\t" - : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23"); + "vl %%v22,0(%[y])\n\t" + "vl %%v23,16(%[y])\n\t" + "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" + "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" + "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" + "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" + "vst %%v22,0(%[y])\n\t" + "vst %%v23,16(%[y])\n\t" + : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v18,0(%%r1,%[ap])\n\t" - "vlrepg %%v19,8(%%r1,%[ap])\n\t" - "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" + "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg 
%%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v18,16(%%r1,%[ap])\n\t" - "vlrepg %%v19,24(%%r1,%[ap])\n\t" - "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vpdi %%v17,%%v16,%%v16,4\n\t" + "vlrepg %%v18,16(%%r1,%[ap])\n\t" + "vlrepg %%v19,24(%%r1,%[ap])\n\t" + "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" + "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vpdi %%v17,%%v16,%%v16,4\n\t" #if !defined(XCONJ) - "vlrepg %%v18,0(%[alpha])\n\t" - "vleg %%v19,8(%[alpha]),0\n\t" - "wflcdb %%v19,%%v19\n\t" - "vleg %%v19,8(%[alpha]),1\n\t" + "vlrepg %%v18,0(%[alpha])\n\t" + "vleg %%v19,8(%[alpha]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,8(%[alpha]),1\n\t" #else - "vleg %%v18,0(%[alpha]),1\n\t" - "vflcdb %%v18,%%v18\n\t" - "vleg %%v18,0(%[alpha]),0\n\t" - "vlrepg %%v19,8(%[alpha])\n\t" + "vleg %%v18,0(%[alpha]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[alpha]),0\n\t" + "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y])\n\t" - "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" - "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y])\n\t" - : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); + "vl %%v0,0(%[y])\n\t" + "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" + "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" + "vst %%v0,0(%[y])\n\t" + : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index aa7f16605..6284d5a47 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepg %%v0,%[c]\n\t" - "vlrepg %%v1,%[s]\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb 
%%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + 
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", 
"v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index fbcc0c5b9..e497a6d7b 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -29,167 +29,170 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vpdi %%v28,%%v20,%%v20,4\n\t" - "vpdi %%v29,%%v21,%%v21,4\n\t" - "vpdi %%v30,%%v22,%%v22,4\n\t" - "vpdi %%v31,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vpdi %%v28,%%v20,%%v20,4\n\t" + "vpdi %%v29,%%v21,%%v21,4\n\t" + "vpdi %%v30,%%v22,%%v22,4\n\t" + "vpdi %%v31,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmadb 
%%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vleg %%v0,8(%[alpha]),0\n\t" - "wflcdb %%v0,%%v0\n\t" - "vleg %%v0,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v16,%%v16,%%v16,4\n\t" - "vpdi %%v17,%%v17,%%v17,4\n\t" - "vpdi %%v18,%%v18,%%v18,4\n\t" - "vpdi %%v19,%%v19,%%v19,4\n\t" - "vpdi %%v20,%%v20,%%v20,4\n\t" - "vpdi %%v21,%%v21,%%v21,4\n\t" - "vpdi %%v22,%%v22,%%v22,4\n\t" - "vpdi %%v23,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "wflcdb %%v0,%%v0\n\t" + "vleg %%v0,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v16,%%v16,%%v16,4\n\t" + "vpdi %%v17,%%v17,%%v17,4\n\t" + "vpdi %%v18,%%v18,%%v18,4\n\t" + "vpdi %%v19,%%v19,%%v19,4\n\t" + "vpdi %%v20,%%v20,%%v20,4\n\t" + "vpdi %%v21,%%v21,%%v21,4\n\t" + "vpdi %%v22,%%v22,%%v22,4\n\t" + "vpdi %%v23,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void 
zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0f38103be..bc466866c 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 
16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, From f5836741092ca3f9358c2a24c6056bf098b3f748 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 12 Feb 2019 13:12:28 +0200 Subject: [PATCH 451/935] [ZARCH] Fix cgemv_t_4 --- kernel/zarch/cgemv_t_4.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 91ea1c10c..e10edfab0 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -120,10 +120,10 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v20\n\t" - "vfadb %%v17,%%v17,%%v21\n\t" - "vfadb %%v18,%%v18,%%v22\n\t" - "vfadb %%v19,%%v19,%%v23\n\t" + "vfasb %%v16,%%v16,%%v20\n\t" + "vfasb %%v17,%%v17,%%v21\n\t" + "vfasb %%v18,%%v18,%%v22\n\t" + "vfasb %%v19,%%v19,%%v23\n\t" "vrepg %%v20,%%v16,1\n\t" "vrepg %%v21,%%v17,1\n\t" "vrepg %%v22,%%v18,1\n\t" @@ -244,8 +244,8 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v19,%%v23,%%v1,%%v19\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v18\n\t" - "vfadb %%v17,%%v17,%%v19\n\t" + "vfasb %%v16,%%v16,%%v18\n\t" + "vfasb %%v17,%%v17,%%v19\n\t" "vrepg %%v18,%%v16,1\n\t" "vrepg %%v19,%%v17,1\n\t" "vfasb %%v16,%%v16,%%v18\n\t" @@ -342,7 +342,7 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmasb %%v17,%%v19,%%v1,%%v17\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" "vrepg %%v17,%%v16,1\n\t" "vfasb %%v16,%%v16,%%v17\n\t" "verllg %%v17,%%v16,32\n\t" From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 15:33:48 +0100 Subject: [PATCH 452/935] Fix declaration of input arguments in the x86_64 s/dGEMV_T and s/dGEMV_N kernels 
Arguments 0 and 1 need to be tagged as both input and output --- kernel/x86_64/dgemv_n_4.c | 10 +++++----- kernel/x86_64/dgemv_t_4.c | 18 +++++++++--------- kernel/x86_64/sgemv_n_4.c | 14 +++++++------- kernel/x86_64/sgemv_t_4.c | 18 +++++++++--------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d2530e81..6d33641e9 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "jnz 1b \n\t" : + "+r" (i), // 0 + "+r" (n) // 1 : - "r" (i), // 0 - "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index a7478e3a8..ed672a757 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movsd %%xmm11,8(%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movsd %%xmm10, (%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 65305ac59..63697970f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "3: \n\t" : + "+r" (i), // 0 + "+r" (n1) // 1 : - "r" (i), // 0 - "r" (n1), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 @@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) "jnz 1b \n\t" : + "+r" (i), // 0 + "+r" (n) // 1 : - "r" (i), // 0 - "r" (n), // 1 "r" (src), // 2 "r" (dest) // 3 : "cc", diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 065e5b385..86ecaf516 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movss %%xmm11,4(%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movss %%xmm10, (%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 From 
91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 15:51:43 +0100 Subject: [PATCH 453/935] Fix declaration of input arguments in inline assembly Argument 0 is modified as it doubles as a counter --- kernel/x86_64/dscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index ef9a0a6ba..d0d7801fd 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ "jnz 1b \n\t" : + "+r" (n) // 0 : - "r" (n), // 0 "r" (x), // 1 "r" (x1), // 2 "r" (alpha), // 3 From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 16:00:18 +0100 Subject: [PATCH 454/935] Fix declaration of assembly arguments in SSYMV and DSYMV microkernels Arguments 0 and 1 are both input and output --- kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++--- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c index d7166fe4b..ae287b6d8 100644 --- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c @@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c index d83d20f8e..4778f644a 100644 --- a/kernel/x86_64/dsymv_U_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c @@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c index 1344c75f7..065182286 100644 --- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c index 1ef6fbafd..d84e703bd 100644 --- a/kernel/x86_64/dsymv_U_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c @@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index 8c01ab806..4a4f4d68d 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ 
-90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c index a32e59b44..e6a09ccf8 100644 --- a/kernel/x86_64/ssymv_U_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c @@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c index b8e6ee732..c56ff3b15 100644 --- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c @@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c index e8650650c..c4919a39a 100644 --- a/kernel/x86_64/ssymv_U_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c @@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 16:14:02 +0100 Subject: [PATCH 455/935] Fix declaration of arguments in inline assembly Argument 0 is modified so should be input and output --- kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++---- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c index d84470cc4..bfa07b6d0 100644 --- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c @@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 866782ee6..6241879d5 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c index 38479f77a..a161dcd8b 100644 --- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -108,8 +108,8 @@ static void 
dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c index b4e6ab369..b205b1019 100644 --- a/kernel/x86_64/dsymv_L_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c @@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c index 9002228f3..602c3edf2 100644 --- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c @@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c index 69db008b6..fdfe4349a 100644 --- a/kernel/x86_64/ssymv_L_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c @@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c index c0fe5d640..6bb9c02f6 100644 --- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c index 093ca8073..0c78212e7 100644 --- a/kernel/x86_64/ssymv_L_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c @@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 @@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 From bec54ae366ebce932b6bd6bdc89d4e585a0da798 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 13 Feb 2019 12:54:35 +0200 Subject: [PATCH 456/935] [ZARCH] Fix caxpy --- kernel/zarch/caxpy.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index e4b484ab7..14a124ae2 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -65,6 +65,14 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vl %%v21,80(%%r1,%[y])\n\t" "vl %%v22,96(%%r1,%[y])\n\t" "vl %%v23,112(%%r1,%[y])\n\t" + "verllg %%v24,%%v8,32\n\t" + "verllg %%v25,%%v9,32\n\t" + "verllg %%v26,%%v10,32\n\t" + "verllg %%v27,%%v11,32\n\t" + "verllg %%v28,%%v16,32\n\t" + "verllg %%v29,%%v17,32\n\t" + "verllg %%v30,%%v18,32\n\t" + "verllg %%v31,%%v19,32\n\t" "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" From 
0a54c98b9d9a6ad8364297bbef0eea4b000a92f0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 13 Feb 2019 21:06:25 +0200 Subject: [PATCH 457/935] [ZARCH] Modify constraints --- kernel/zarch/cgemv_n_4.c | 2 +- kernel/zarch/zgemv_n_4.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index adba05d47..5c36bc338 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -352,7 +352,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "brctg %[n],0b" : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), - [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 5ca8da3c1..13045a359 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -263,7 +263,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "brctg %[n],0b" : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), - [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); From f9d67bb5e8e895fd5fe7e36e43febef7aa06ef35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Feb 2019 22:06:41 +0100 Subject: [PATCH 458/935] Fix out-of-bounds memory access in gemm_beta Fixes #2011 (as suggested by davemq) presuming typo by K.Goto --- kernel/power/gemm_beta.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 62d7761ec..7acc05b4d 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -129,7 +129,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbst PRE, CO1 + dcbtst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 From 718efcec6fb6d45d5dd461ed47b26f49c2c4e77d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Feb 2019 22:08:37 +0100 Subject: [PATCH 459/935] Fix out-of-bounds memory access in gemm_beta Fixes #2011 (as suggested by davemq), assuming typo by K.Goto --- kernel/power/zgemm_beta.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 43b72ca15..1f4c29210 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -134,7 +134,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbst PRE, CO1 + dcbtst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 From b55c586faca28863db16a2148b69aaa37aaa797e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 15:21:36 +0100 Subject: [PATCH 460/935] Fix missing clobber in x86/x86_64 blas_quickdivide inline assembly function (#2017) * Fix missing clobber in blas_quickdivide assembly --- common_x86.h | 2 +- common_x86_64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common_x86.h b/common_x86.h index 4f538c948..3fdffe2a8 100644 --- a/common_x86.h +++ b/common_x86.h @@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result) 
:"a"(x), "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); return result; #endif diff --git a/common_x86_64.h b/common_x86_64.h index f27c1e9be..718a81050 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); return result; } From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 14 Feb 2019 16:19:41 +0000 Subject: [PATCH 461/935] dgemv_kernel_4x4(Haswell): add missing clobbers for xmm0,xmm1,xmm2,xmm3 This fixes a crash in dblat2 when OpenBLAS is compiled using -march=znver1 -ftree-vectorize -O2 See also: https://github.com/easybuilders/easybuild-easyconfigs/issues/7180 --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 584a6c6b5..da0fa2fff 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 22:43:18 +0100 Subject: [PATCH 462/935] Save and restore input argument 8 (lda4) Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009) --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 2c90f8aa9..e89a16785 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); @@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "movq %8, %%xmm10 \n\t" //save lda + "testq $0x04, %1 \n\t" "jz 2f \n\t" @@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "4: \n\t" "vzeroupper \n\t" + "movq %%xmm10, %8 \n\t" //restore lda : "+r" (i), // 0 @@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", + "%xmm10", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO } - #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz 2f \n\t" From adb419ed67cb6b3c416a7e6babdd28390cefe37d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 22:57:30 +0100 Subject: [PATCH 463/935] With the Intel compiler on Linux, prefer ifort for the final link step icc has known problems with mixed-language builds that ifort can handle just fine. Fixes #1956 --- exports/Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 3a5f77db3..b1348bd4a 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -141,6 +141,14 @@ else $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c endif + +ifeq ($(F_COMPILER), INTEL) + $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ + -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +else + ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ @@ -152,6 +160,7 @@ else -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
+endif endif rm -f linktest From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Feb 2019 10:10:04 +0100 Subject: [PATCH 464/935] Rename operands to put lda on the input/output constraint list --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------ 1 file changed, 61 insertions(+), 65 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index e89a16785..93e1e26e8 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%3), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha - "movq %8, %%xmm10 \n\t" //save lda - "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" - "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" - "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" - "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "testq $0x08, %1 \n\t" "jz 3f \n\t" - "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" - "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" - "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" - 
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" @@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y - - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - - "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" "addq $16, %0 \n\t" - "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" - "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" - "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" + "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" + "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $16, %8 \n\t" - "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y + "addq $16, %2 \n\t" + "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" - "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y "jnz 1b \n\t" "4: \n\t" "vzeroupper \n\t" - "movq %%xmm10, %8 \n\t" //restore lda : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", @@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", - "%xmm10", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); From 
c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Feb 2019 15:08:16 +0100 Subject: [PATCH 465/935] Fix wrong constraints in inline assembly for #2009 --- kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index fcab8e2c7..9ab78fc8e 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " cmpq $0, %0 \n\t" " je 4f \n\t" - " vmovups (%2,%1,4), %%ymm0 \n\t" // read a - " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 - " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vmovups (%8,%1,4), %%ymm0 \n\t" // read a + " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 + " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 " addq $8, %1 \n\t" @@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .p2align 4 \n\t" "1: \n\t" - " vmovups (%2,%1,4), %%ymm4 \n\t" // read a + " vmovups (%8,%1,4), %%ymm4 \n\t" // read a " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 + " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" - " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 + " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 22f \n\t" - " vmovups (%2,%1,4), %%ymm0 \n\t" // read a + " vmovups (%8,%1,4), %%ymm0 \n\t" // read a " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" - " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 + " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" - " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" @@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" - " vmovups (%9), %%ymm0 \n\t" + " vmovups (%3), %%ymm0 \n\t" " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" @@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" - " vmovups 32(%9), %%ymm4 \n\t" + " vmovups 32(%3), %%ymm4 \n\t" " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" @@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "5: \n\t" // i = 0 - " addq $64, %9 \n\t" // b=b+8 + " addq $64, %3 \n\t" // b=b+8 " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb - " vmovups (%9), %%ymm0 
\n\t" - " vmovups %%ymm8 , (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups %%ymm8 , (%2) \n\t" // write a " vmovups %%ymm8 , (%4) \n\t" // write c " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" @@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm9 , (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm9 , (%2) \n\t" // write a " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" @@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm10, (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm10, (%2) \n\t" // write a " vmovups %%ymm10, (%4,%7,2) \n\t" // write c " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" @@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm11, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm11, (%2) \n\t" // write a " vmovups %%ymm11, (%5) \n\t" // write c " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" @@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm12, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm12, (%2) \n\t" // write a " vmovups %%ymm12, (%5,%7,1) \n\t" // write c " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" @@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm13, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm13, (%2) \n\t" // write a " vmovups %%ymm13, (%5,%7,2) \n\t" // write c " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" @@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT 
*b, FLOAT *c, BLASLON " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm14, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm14, (%2) \n\t" // write a " vmovups %%ymm14, (%6) \n\t" // write c " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" - " addq $32, %8 \n\t" // a=a+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb - " vmovups %%ymm15, (%8) \n\t" // write a + " vmovups %%ymm15, (%2) \n\t" // write a " vmovups %%ymm15, (%6,%7,1) \n\t" // write c " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c3), // 5 "r" (c6), // 6 "r" (ldc), // 7 - "r" (as), // 8 - "r" (bs) // 9 + "r" (a), // 8 + "r" (b) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From f209fc7fa90a583e60ff2c667821d39ae0efbe70 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sat, 16 Feb 2019 12:12:39 +0100 Subject: [PATCH 466/935] Update Makefile.rule add note about NUM_THREADS for package maintainers, add examples of programs that cause affinity troubles --- Makefile.rule | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index faf34c0a1..bba3d1588 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -72,10 +72,16 @@ VERSION = 0.3.6.dev # You can define the maximum number of threads. Basically it should be less # than or equal to the number of CPU threads. If you don't specify one, it's -# automatically detected by the the script. +# automatically detected by the build system. # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to # restrict NUM_THREADS to the number of physical cores. By default, the automatic # detection includes logical CPUs, thus allowing the use of SMT. +# Users may opt at runtime to use less than NUM_THREADS threads. +# +# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS +# value (eg. 32-256) if you expect your users to use that many threads. Due to the way +# some internal structures are allocated, using a large NUM_THREADS value has a RAM +# footprint penalty, even if users reduce the actual number of threads at runtime. # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call @@ -138,6 +144,7 @@ NO_WARMUP = 1 # to the same core(s) as OpenBLAS, possibly binding all threads to a single core. # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing # else modifies affinity settings. +# Note: enabling affinity has been known to cause problems with NumPy and R NO_AFFINITY = 1 # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:24:11 +0100 Subject: [PATCH 467/935] Fix inline assembly constraints rework indices to allow marking argument lda4 as input and output. 
For #2009 --- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index 11a3e943b..d21232bfa 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "movss (%2), %%xmm12 \n\t" // x0 - "movss 4(%2), %%xmm13 \n\t" // x1 - "movss 8(%2), %%xmm14 \n\t" // x2 - "movss 12(%2), %%xmm15 \n\t" // x3 + "movss (%3), %%xmm12 \n\t" // x0 + "movss 4(%3), %%xmm13 \n\t" // x1 + "movss 8(%3), %%xmm14 \n\t" // x2 + "movss 12(%3), %%xmm15 \n\t" // x3 "shufps $0, %%xmm12, %%xmm12\n\t" "shufps $0, %%xmm13, %%xmm13\n\t" "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" - "movss 16(%2), %%xmm0 \n\t" // x4 - "movss 20(%2), %%xmm1 \n\t" // x5 - "movss 24(%2), %%xmm2 \n\t" // x6 - "movss 28(%2), %%xmm3 \n\t" // x7 + "movss 16(%3), %%xmm0 \n\t" // x4 + "movss 20(%3), %%xmm1 \n\t" // x5 + "movss 24(%3), %%xmm2 \n\t" // x6 + "movss 28(%3), %%xmm3 \n\t" // x7 "shufps $0, %%xmm0 , %%xmm0 \n\t" "shufps $0, %%xmm1 , %%xmm1 \n\t" "shufps $0, %%xmm2 , %%xmm2 \n\t" @@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y ".p2align 1 \n\t" - "movups (%4,%0,4), %%xmm8 \n\t" - "movups (%5,%0,4), %%xmm9 \n\t" - "movups (%6,%0,4), %%xmm10 \n\t" - "movups (%7,%0,4), %%xmm11 \n\t" + "movups (%5,%0,4), %%xmm8 \n\t" + "movups (%6,%0,4), %%xmm9 \n\t" + "movups (%7,%0,4), %%xmm10 \n\t" + "movups (%8,%0,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" @@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "movups (%4,%8,4), %%xmm8 \n\t" - "movups (%5,%8,4), %%xmm9 \n\t" - "movups (%6,%8,4), %%xmm10 \n\t" - "movups (%7,%8,4), %%xmm11 \n\t" + "movups (%5,%2,4), %%xmm8 \n\t" + "movups (%6,%2,4), %%xmm9 \n\t" + "movups (%7,%2,4), %%xmm10 \n\t" + "movups (%8,%2,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" @@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addps %%xmm5 , %%xmm4 \n\t" "addq $4 , %0 \n\t" "mulps %%xmm6 , %%xmm4 \n\t" "subq $4 , %1 \n\t" "addps %%xmm4 , %%xmm7 \n\t" - "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y + "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y "jnz 1b \n\t" : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:36:39 +0100 Subject: [PATCH 468/935] Fix inline assembly constraints rework indices to allow marking argument lda as input and output. 
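
The recurring pattern in this run of constraint fixes: whenever the assembly modifies a register that was declared as a plain "r" input, gcc is entitled to assume the register still holds its original value after the asm statement, and newer optimizers (e.g. gcc 9, -ftree-vectorize, -march=znver1) will exploit that assumption and miscompile the surrounding code (#2009, #2017). The cure is either to move the operand to the output list with the "+" (read-write) modifier or, for registers the asm merely uses as scratch, to name them in the clobber list. Below is a minimal sketch of the broken and the corrected declaration, using hypothetical variable names rather than the actual kernel code (x86-64 gcc extended asm; the sketch assumes n > 0):

    #include <stdint.h>

    static void countdown_broken(uint64_t n) {
        /* BROKEN: the asm decrements n, but n is declared input-only,
           so gcc may keep using the stale register value afterwards. */
        __asm__ __volatile__(
            "1: subq $1, %0 \n\t"
            "   jnz  1b     \n\t"
            :            /* no outputs */
            : "r"(n)     /* input-only: a lie, the loop writes it */
            : "cc");
    }

    static void countdown_fixed(uint64_t n) {
        /* FIXED: "+r" declares n as both read and written. */
        __asm__ __volatile__(
            "1: subq $1, %0 \n\t"
            "   jnz  1b     \n\t"
            : "+r"(n)    /* input and output */
            :
            : "cc");
    }

The same reasoning covers the clobber fixes above: mull implicitly writes both %eax and %edx, and the gemv microkernels scribble on xmm/ymm registers, so any such register that is neither an input nor an output must appear in the clobber list, or the compiler may keep a live value there across the asm.
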
--- kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index b35daa35b..3fc46542b 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%3), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha @@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" - "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" - "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" - "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" - "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" - "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" - "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" - "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" + "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" @@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" - "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - "addq $4, %8 \n\t" + "addq $4, %2 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" @@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" - "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - 
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" - "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" - "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" - "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" @@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" - "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - "addq $8, %8 \n\t" + "addq $8, %2 \n\t" "addq $8, %0 \n\t" "subq $8, %1 \n\t" @@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" - "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" - "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" - "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%8,%0,4) \n\t" + "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%4,%8,4) \n\t" - "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" - "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" - "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" - "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" + "prefetcht0 192(%5,%2,4) \n\t" + "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%6,%2,4) \n\t" + "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" - "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" - "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" - "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" - "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "prefetcht0 192(%7,%2,4) \n\t" + "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%8,%2,4) \n\t" + "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" @@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps 
%%ymm6, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" - "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y - "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y - "addq $16, %8 \n\t" + "addq $16, %2 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:46:17 +0100 Subject: [PATCH 469/935] Fix inline assembly constraints --- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++----------- 1 file changed, 97 insertions(+), 97 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c index 31001c7f3..bbf06c84b 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "vbroadcastss (%2), %%xmm12 \n\t" // x0 - "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 - "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 - "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 - "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 - "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 - "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 - "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 + "vbroadcastss (%3), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 + "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 + "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 + "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 + "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 "vbroadcastss (%9), %%xmm8 \n\t" // alpha @@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" "addq $4 , %0 \n\t" - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" - "addq $4 , %8 \n\t" + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" + "addq $4 , %2 \n\t" "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" - 
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" + "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" "subq $4 , %1 \n\t" - "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y "2: \n\t" @@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y + "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y "addq $8 , %0 \n\t" - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "subq $8 , %1 \n\t" @@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%6,%0,4) \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, 
%%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" + "prefetcht0 192(%8,%0,4) \n\t" + "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" ".align 2 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" - - "prefetcht0 192(%4,%8,4) \n\t" - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" - "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" - "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" + + "prefetcht0 192(%5,%2,4) \n\t" + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" + "prefetcht0 192(%6,%2,4) \n\t" + "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "prefetcht0 192(%7,%2,4) \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" + "prefetcht0 192(%8,%2,4) \n\t" + "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" - "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" - "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" + "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" + "vfmaddps 
48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" "addq $16, %0 \n\t" - "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y - "addq $16, %8 \n\t" - "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y + "addq $16, %2 \n\t" + "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:51:09 +0100 Subject: [PATCH 470/935] Fix inline assembly constraints --- dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 dgemv_n_microk_piledriver-4.c diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c new file mode 100644 index 000000000..466931b82 --- /dev/null +++ b/dgemv_n_microk_piledriver-4.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%3), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 + + "vbroadcastsd (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + + "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + + "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y + + "addq $4 , %2 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "2: \n\t" + + "cmpq $0, %1 \n\t" + "je 3f \n\t" + + + ".align 16 \n\t" + "1: \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "addq $8 , %2 \n\t" + "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y + "subq $8 , %1 \n\t" + "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y + + "jnz 1b \n\t" + + "3: \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 + "+r" (n), // 1 + "+r" (lda4) // 2 + : + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT 
*x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + + "vbroadcastsd (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "2: \n\t" + + "cmpq $0, %1 \n\t" + "je 3f \n\t" + + + ".align 16 \n\t" + "1: \n\t" + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + + "3: \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 + "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 20:06:48 +0100 Subject: [PATCH 471/935] Fix inline assembly constraints in Bulldozer TRSM kernels rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). 
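The rationale behind the index rework, stated once here since the same pattern recurs in every hunk below: in GCC extended asm, an operand listed only as an input is promised to be left unchanged, so any register the assembly block itself advances (i, as and bs here; i, n and lda4 in the gemv kernels above) has to move to the output list with a "+r" (or "+a") constraint. A minimal sketch of the pattern follows; it is not code from the patch, and sum_loop and its variables are illustrative names only.

#include <stdio.h>

/* Minimal sketch (x86-64, GCC-style extended asm) of the constraint
   pattern adopted in these patches: i and n are advanced inside the
   asm block, so they are declared "+r" (input/output); x is only
   read, so a plain "r" input suffices. */
static long sum_loop(const long *x, long n)
{
    long i   = 0;
    long acc = 0;

    __asm__ __volatile__
    (
        "1:                        \n\t"
        " addq (%3,%0,8), %2       \n\t"   // acc += x[i]
        " addq $1, %0              \n\t"   // i++
        " subq $1, %1              \n\t"   // n--
        " jnz 1b                   \n\t"   // loop while n > 0
        :
          "+r" (i),   // 0  modified by the asm -> input/output
          "+r" (n),   // 1  modified by the asm -> input/output
          "+r" (acc)  // 2  modified by the asm -> input/output
        :
          "r" (x)     // 3  read only -> plain input
        : "cc", "memory"
    );

    return acc;
}

int main(void)
{
    long v[4] = { 1, 2, 3, 4 };
    printf("%ld\n", sum_loop(v, 4));   // prints 10
    return 0;
}

Before the rework, operands like these were listed as plain inputs while the assembly advanced them, leaving the optimizer free to assume they still held their original values after the block — the kind of silent breakage these constraint fixes guard against.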
For #2009 --- kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++---- kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++----------- kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++----------- kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++--- kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++--- 5 files changed, 356 insertions(+), 356 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c index 54df5b359..35ed4cc01 100644 --- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c @@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " prefetcht0 384(%3,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " prefetcht0 384(%7,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 
32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vmovddup (%7), %%xmm1 \n\t" // read b - " vmovddup 8(%7), %%xmm0 \n\t" // read bb + " vmovddup (%3), %%xmm1 \n\t" // read b + " vmovddup 8(%3), %%xmm0 \n\t" // read bb " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $16 , %7 \n\t" // b = b - 2 - " subq $64 , %6 \n\t" // a = a - 8 + " subq $16 , %3 \n\t" // b = b - 2 + " subq $64 , %2 \n\t" // a = a - 8 - " vmovddup (%7), %%xmm0 \n\t" // read bb + " vmovddup (%3), %%xmm0 \n\t" // read bb " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c index 1b8991c6c..3cd215000 100644 --- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c @@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: 
\n\t" - " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) 
\n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 
, %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] + " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] + " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 
* aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] + " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] + " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] + " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // 
bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] + " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] + " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] + " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // 
read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] + " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] + " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c index 0623dddb0..a4a62491c 100644 --- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" - " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] + " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // 
extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] + " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] + " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read 
a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] + " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] + " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 
\n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] + " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] + " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] + " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " 
vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] + " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] + " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" 
// b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 
\n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c index 4cc557d55..c11c84cec 100644 --- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 0 - " vbroadcastss (%7), %%xmm0 \n\t" // read bb - " vbroadcastss 4(%7), %%xmm1 \n\t" // read b + " vbroadcastss (%3), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%3), %%xmm1 \n\t" // read b " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , 
(%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" " \n\t" // i = 1 - " addq $8 , %7 \n\t" // b = b + 2 - " addq $64 , %6 \n\t" // a = a + 16 + " addq $8 , %3 \n\t" // b = b + 2 + " addq $64 , %2 \n\t" // a = a + 16 - " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c index 73f6e8a95..326ca2976 100644 --- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c @@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vbroadcastss (%7), %%xmm1 \n\t" // read b - " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + " vbroadcastss (%3), %%xmm1 \n\t" // read b + " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups 
%%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $8 , %7 \n\t" // b = b - 2 - " subq $64 , %6 \n\t" // a = a - 16 + " subq $8 , %3 \n\t" // b = b - 2 + " subq $64 , %2 \n\t" // a = a - 16 - " vbroadcastss (%7), %%xmm0 \n\t" // read bb + " vbroadcastss (%3), %%xmm0 \n\t" // read bb " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From 56089991e2305ce692482186825c44c89a535518 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sat, 16 Feb 2019 23:26:13 +0100 Subject: [PATCH 472/935] fix the the --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index bba3d1588..91f42e396 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -72,7 +72,7 @@ VERSION = 0.3.6.dev # You can define the maximum number of threads. Basically it should be less # than or equal to the number of CPU threads. If you don't specify one, it's -# automatically detected by the the build system. +# automatically detected by the build system. # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to # restrict NUM_THREADS to the number of physical cores. By default, the automatic # detection includes logical CPUs, thus allowing the use of SMT. 
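A note on the inline-assembly constraint changes in the strsm_kernel_*_bulldozer patches above: the pointers as and bs (and the counters n1 and i) are advanced by addq/subq instructions inside the asm body, yet the old code declared them as plain inputs ("r"). GCC assumes input operands are left unchanged, so it may keep reusing the clobbered registers and miscompile the surrounding code. The fix promotes every modified operand to the read-write output list ("+r", "+a") and renumbers the remaining true inputs, which is why %6/%7 become %2/%3 throughout. The dgemv_n_microk_piledriver patch further below applies the same treatment to the lda4 operand. A minimal sketch of the idiom follows, with hypothetical names (sum_longs, buf, len) rather than the actual kernel code:

#include <stddef.h>

/* Sum len longs starting at buf. buf and len are rewritten by the asm
 * body, so they must be read-write outputs ("+r"); listing them as
 * inputs, as the old kernels did, is undefined behavior.
 * Assumes x86-64, AT&T syntax, len > 0. */
static long sum_longs(const long *buf, size_t len) {
    long total = 0;
    __asm__ __volatile__ (
        "1:                          \n\t"
        "  addq    (%[p]) , %[s]     \n\t"   /* total += *buf             */
        "  addq    $8     , %[p]     \n\t"   /* buf++  (operand modified) */
        "  subq    $1     , %[n]     \n\t"   /* len--  (operand modified) */
        "  jnz     1b                \n\t"
        : [s] "+r" (total), [p] "+r" (buf), [n] "+r" (len)  /* read-write */
        :                                                   /* no pure inputs */
        : "cc", "memory");
    return total;
}

With "+r" the compiler both supplies the initial register value and knows the register holds a new one when the asm block ends, so no stale assumption survives the block.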
From 78d9910236739e98a16244679bbd814f1d79ca7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 20:59:48 +0100 Subject: [PATCH 473/935] Correct range_n limiting same bug as seen in #1388, somehow missed in corresponding PR #1389 --- driver/level2/trmv_thread.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 24b881a93..00092e956 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -346,8 +346,8 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m) range_n[num_cpu] = m; - + if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; + } queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -386,8 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m) range_n[num_cpu] = m; - + if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; From e29b0cfcc439b1598ba26486763b3cfa46583a9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 21:03:30 +0100 Subject: [PATCH 474/935] Allow multithreading TRMV again revert workaround introduced for issue #1332 as the actual cause appears to be my incorrect fix from #1262 (see #1388) --- interface/trmv.c | 5 +---- interface/ztrmv.c | 3 --- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/interface/trmv.c b/interface/trmv.c index 7c40ae976..2e52527a3 100644 --- a/interface/trmv.c +++ b/interface/trmv.c @@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP -/* nthreads = num_cpu_avail(2); + nthreads = num_cpu_avail(2); -FIXME trmv_thread was found to be broken, see issue 1332 */ - nthreads = 1; - if (nthreads == 1) { #endif diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 0e16632e0..4c47e9e91 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } else nthreads = 1; -/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ - nthreads = 1; - if(nthreads > 1) { buffer_size = n > 16 ? 
0 : n * 4 + 40; } From 45333d57931ddc64fb3e8a091e0616dd9528cef1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 22:16:33 +0100 Subject: [PATCH 475/935] Fix error introduced during cleanup --- driver/level2/trmv_thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 00092e956..43eeb40d2 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -347,7 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; - } + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -387,6 +387,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; From 343b301d14875a17ff4357bd98bea29d0df70741 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Feb 2019 10:27:48 +0100 Subject: [PATCH 476/935] Reduce list of kernels in the dynamic arch build to make compilation complete reliably within the 1h limit again --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 741c66291..44a616aaa 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -55,7 +55,7 @@ before_build: - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. + - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. build_script: - cmake --build . From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 24 Feb 2019 20:41:02 +0200 Subject: [PATCH 478/935] move fix to right place --- dgemv_n_microk_piledriver-4.c | 247 -------------------- kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++---- 2 files changed, 49 insertions(+), 296 deletions(-) delete mode 100644 dgemv_n_microk_piledriver-4.c diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c deleted file mode 100644 index 466931b82..000000000 --- a/dgemv_n_microk_piledriver-4.c +++ /dev/null @@ -1,247 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - - -#define HAVE_KERNEL_4x8 1 -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastsd (%3), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 - - "vbroadcastsd (%9), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - - "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - - "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - - "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - - - ".align 16 \n\t" - "1: \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "addq $8 , %0 \n\t" - "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 
\n\t" - "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - - "addq $8 , %2 \n\t" - "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y - "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y - - "jnz 1b \n\t" - - "3: \n\t" - "vzeroupper \n\t" - - : - "+r" (i), // 0 - "+r" (n), // 1 - "+r" (lda4) // 2 - : - "r" (x), // 3 - "r" (y), // 4 - "r" (ap[0]), // 5 - "r" (ap[1]), // 6 - "r" (ap[2]), // 7 - "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -#define HAVE_KERNEL_4x4 1 -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - - "vbroadcastsd (%8), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - - - ".align 16 \n\t" - "1: \n\t" - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - - "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y - "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y - - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - "jnz 1b \n\t" - - "3: \n\t" - "vzeroupper \n\t" - - : - "+r" (i), // 0 - "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (alpha) // 8 - : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c index 530780bab..466931b82 100644 --- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c +++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c @@ 
-38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + "vbroadcastsd (%3), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" "addq $8 , %0 \n\t" - "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" - 
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y "jnz 1b \n\t" @@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From 918a0cc4d1548617478f925c8341461c055268e5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Feb 2019 17:55:36 +0100 Subject: [PATCH 479/935] Fix missing -c option in AVX512 test --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 38f9170ca..d93b756d5 100644 --- a/c_check +++ b/c_check @@ -232,7 +232,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { ($fh,$tmpf) = tempfile( UNLINK => 1 ); $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; - $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? 
!= 0) { From fd34820b99bd302ed2b31ca0e5fedeb492a179c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Feb 2019 17:58:31 +0100 Subject: [PATCH 480/935] Fix AVX512 test always returning false due to missing compiler option --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 6b602c1b0..88bb081a6 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -78,7 +78,7 @@ endif() if (X86_64 OR X86) file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include \n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") -execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) +execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() From d66214c94628bb2050b2ab83361d1ac54d3373b5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 28 Feb 2019 09:58:25 +0100 Subject: [PATCH 481/935] Make x86_32 imply NO_AVX2, NO_AVX512 in addition to NO_AVX fixes #2033 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 67c8cd197..bbd777448 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,7 +155,7 @@ GETARCH_FLAGS += -DNO_AVX endif ifeq ($(BINARY), 32) -GETARCH_FLAGS += -DNO_AVX +GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 endif ifeq ($(NO_AVX2), 1) From 2ffb72718787bea52f7958d2fe5b91c489cd2aee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 28 Feb 2019 10:51:54 +0100 Subject: [PATCH 482/935] Keep xcode8.3 for osx BINARY=32 build as xcode10 deprecated i386 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index ec5dc8a9b..eee7674fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -160,6 +160,7 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos + osx_image: xcode8.3 env: - BTYPE="BINARY=32" From c4868d11c02f1ac97e71afdef3dc49429678959b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Mar 2019 09:23:03 +0100 Subject: [PATCH 483/935] Make sure that AVX512 is disabled in 32bit builds for #2033 --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index bbd777448..53f89b2fa 100644 --- a/Makefile.system +++ b/Makefile.system @@ -156,6 +156,7 @@ endif ifeq ($(BINARY), 32) GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 +NO_AVX512 = 1 endif ifeq ($(NO_AVX2), 1) From 25427926bc8b74a48e335ae05c56cbfd8d0187b9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Mar 2019 23:36:36 +0100 Subject: [PATCH 484/935] Improve handling of NO_STATIC and NO_SHARED to avoid surprises from defining either as zero. 
Fixes #2035 by addressing some concerns from #1422 --- Makefile | 2 +- Makefile.install | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 21096f893..273fde33e 100644 --- a/Makefile +++ b/Makefile @@ -96,7 +96,7 @@ endif @echo shared : -ifndef NO_SHARED +ifneq ($(NO_SHARED), 1) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so diff --git a/Makefile.install b/Makefile.install index 069c96c6a..fefecd98d 100644 --- a/Makefile.install +++ b/Makefile.install @@ -58,14 +58,14 @@ ifndef NO_LAPACKE endif #for install static library -ifndef NO_STATIC +ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @@ -106,14 +106,14 @@ ifndef NO_LAPACKE endif #for install static library -ifndef NO_STATIC +ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -138,7 +138,7 @@ endif @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" From e4a79be6bb9fac2ba18d820d83bc7bf9173a63c2 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 3 Mar 2019 09:05:11 +0200 Subject: [PATCH 486/935] address warning introed with #1814 et al --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 09851f15c..c30ca71cb 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2584,7 +2584,7 @@ void *blas_memory_alloc(int procpos){ int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; + int mypos = 0; #endif void *map_address; From af480b02a4a45df377acf9be0d6078609bb345c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Mar 2019 14:17:07 +0100 Subject: [PATCH 487/935] Restore locking optimizations for OpenMP case restore another accidentally dropped part of #1468 that was missed in #2004 to address performance regression reported in #1461 --- driver/others/memory.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 2e185593e..a40cb442a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ 
-2647,21 +2647,26 @@ void *blas_memory_alloc(int procpos){ position = 0; +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif do { -/* if (!memory[position].used) { */ -/* blas_lock(&memory[position].lock);*/ - +#if defined(USE_OPENMP) + if (!memory[position].used) { + blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; -/* blas_unlock(&memory[position].lock);*/ -/* } */ - +#if defined(USE_OPENMP) + blas_unlock(&memory[position].lock); + } +#endif position ++; } while (position < NUM_BUFFERS); - UNLOCK_COMMAND(&alloc_lock); - +#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif goto error; allocation : @@ -2671,9 +2676,11 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); - +#else + blas_unlock(&memory[position].lock); +#endif if (!memory[position].addr) { do { #ifdef DEBUG From 783ba8058fbc6d5f0a56d27bc368b659448b1fb1 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:30:50 +0800 Subject: [PATCH 488/935] HiSilicon tsv110 CPUs optimization branch add HiSilicon tsv110 CPUs optimization branch --- kernel/arm64/KERNEL.TSV110 | 175 +++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 kernel/arm64/KERNEL.TSV110 diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 new file mode 100644 index 000000000..04d6940d7 --- /dev/null +++ b/kernel/arm64/KERNEL.TSV110 @@ -0,0 +1,175 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = 
nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + From 53f482ee72e56b31ace7860199c8fb3027af5303 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:41:21 +0800 Subject: [PATCH 489/935] add TARGET support for HiSilicon tsv110 CPUs --- Makefile.arm64 | 5 +++++ 1 file changed, 5 insertions(+) 
diff --git a/Makefile.arm64 b/Makefile.arm64
index cd16dbfae..4d10ff684 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99)
 CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
 endif
+
+ifeq ($(CORE), TSV110)
+CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
+FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
+endif

From 760842dda1fd8f0475216b46ca25fc016f671d05 Mon Sep 17 00:00:00 2001
From: maomao194313
Date: Mon, 4 Mar 2019 16:45:22 +0800
Subject: [PATCH 490/935] add TARGET support for HiSilicon tsv110 CPUs

---
 getarch.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/getarch.c b/getarch.c
index 242d08004..ac58c8226 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1065,6 +1065,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif
 
+#ifdef FORCE_TSV110
+#define FORCE
+#define ARCHITECTURE "ARM64"
+#define SUBARCHITECTURE "TSV110"
+#define SUBDIRNAME "arm64"
+#define ARCHCONFIG "-DTSV110 " \
+       "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME "tsv110"
+#define CORENAME "TSV110"
+#else
+#endif
+
+
 #ifdef FORCE_ZARCH_GENERIC
 #define FORCE
 #define ARCHITECTURE "ZARCH"

From fb4dae71240be9ad1e55792a46b38f8e107cb70a Mon Sep 17 00:00:00 2001
From: maomao194313
Date: Mon, 4 Mar 2019 16:48:49 +0800
Subject: [PATCH 491/935] add TARGET support for HiSilicon tsv110 CPUs

---
 TargetList.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TargetList.txt b/TargetList.txt
index 3a5a32234..aebd0dd18 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -90,6 +90,7 @@ CORTEXA73
 FALKOR
 THUNDERX
 THUNDERX2T99
+TSV110
 
 9.System Z:
 ZARCH_GENERIC

From e4864a8933f6875bbb434887dc9120dbcf6be4dd Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 4 Mar 2019 21:17:08 +0100
Subject: [PATCH 492/935] Fix module definition conflicts between LAPACK and
 ReLAPACK for #2043

---
 CMakeLists.txt | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9de894f9c..a27c1c0fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,10 +75,10 @@ endif ()
 
 set(SUBDIRS ${BLASDIRS})
 if (NOT NO_LAPACK)
-  list(APPEND SUBDIRS lapack)
   if(BUILD_RELAPACK)
     list(APPEND SUBDIRS relapack/src)
   endif()
+  list(APPEND SUBDIRS lapack)
 endif ()
 
 # set which float types we want to build for
@@ -224,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
   SOVERSION ${OpenBLAS_MAJOR_VERSION}
 )
 
+if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
+  if (NOT MSVC)
+    target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
+  else()
+    target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
+  endif()
+endif()
+
 if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
   if (NOT DEFINED ARCH)
     set(ARCH_IN "x86_64")

From 11cfd0bd75a1ce8714ca3abf6867d3f45548dab1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 5 Mar 2019 16:04:25 +0100
Subject: [PATCH 493/935] Do not compile in AVX512 check if AVX support is
 disabled

the xgetbv function depends on NO_AVX being undefined - we could change that
too, but that combo is unlikely to work anyway
---
 driver/others/dynamic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic.c 
b/driver/others/dynamic.c index 99c9254ac..46dfaea6c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -322,7 +322,7 @@ int support_avx2(){ } int support_avx512(){ -#ifndef NO_AVX512 +#if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; From 4290afdae247337261b5ca0ea76e5bfcad2cc4a9 Mon Sep 17 00:00:00 2001 From: ken-cunningham-webuse Date: Wed, 6 Mar 2019 20:55:06 -0800 Subject: [PATCH 494/935] ctest.c : add __POWERPC__ for PowerMac --- ctest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest.c b/ctest.c index 0571e9e02..5e869b901 100644 --- a/ctest.c +++ b/ctest.c @@ -113,7 +113,7 @@ ARCH_X86 ARCH_X86_64 #endif -#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) +#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) ARCH_POWER #endif From b7f59da42d3978234e7e6ed293365b66f340189d Mon Sep 17 00:00:00 2001 From: Celelibi Date: Thu, 7 Mar 2019 16:39:41 +0100 Subject: [PATCH 495/935] Fix crash in sgemm SSE/nano kernel on x86_64 Fix bug #2047. Signed-off-by: Celelibi --- kernel/x86_64/gemm_kernel_4x8_nano.S | 2 +- kernel/x86_64/gemm_kernel_8x4_sse.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S index 074562804..e29520fa1 100644 --- a/kernel/x86_64/gemm_kernel_4x8_nano.S +++ b/kernel/x86_64/gemm_kernel_4x8_nano.S @@ -135,7 +135,7 @@ #endif movq %rsp, %rbx # save old stack - subq $128 + LOCAL_BUFFER_SIZE, %rsp + subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S index c4ef1f809..1602c13c5 100644 --- a/kernel/x86_64/gemm_kernel_8x4_sse.S +++ b/kernel/x86_64/gemm_kernel_8x4_sse.S @@ -383,7 +383,7 @@ EMMS movq %rsp, %rbx # save old stack - subq $128 + LOCAL_BUFFER_SIZE, %rsp + subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING From b0c714ef602095c764b58c0a9ba68fddd9008c73 Mon Sep 17 00:00:00 2001 From: ken-cunningham-webuse Date: Thu, 7 Mar 2019 11:36:35 -0800 Subject: [PATCH 496/935] param.h : enable defines for PPC970 on DarwinOS fixes: gemm.c: In function 'sgemm_': ../common_param.h:981:18: error: 'SGEMM_DEFAULT_P' undeclared (first use in this function) #define SGEMM_P SGEMM_DEFAULT_P ^ --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3cc400b54..48b7ef383 100644 --- a/param.h +++ b/param.h @@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #define ZGEMM_DEFAULT_UNROLL_M 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#ifdef OS_LINUX
+#if defined(OS_LINUX) || defined(OS_DARWIN)
 #if L2_SIZE == 1024976
 #define SGEMM_DEFAULT_P 320
 #define DGEMM_DEFAULT_P 256

From f7a06463d9a0db120cc530a3298f3290855ccbe9 Mon Sep 17 00:00:00 2001
From: ken-cunningham-webuse
Date: Thu, 7 Mar 2019 11:41:58 -0800
Subject: [PATCH 497/935] common_power.h: force DCBT_ARG 0 on PPC970 Darwin

without this, we see

../kernel/power/gemv_n.S:427:Parameter syntax error

and many more similar entries that relate to this assembly command

dcbt 8, r24, r18

this change sets DCBT_ARG = 0, and OpenBLAS builds through to completion on
PowerMac 970. Tests pass.
---
 common_power.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common_power.h b/common_power.h
index e3a1a7aef..68087b071 100644
--- a/common_power.h
+++ b/common_power.h
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define HAVE_PREFETCH
 #endif
 
-#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
+#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || ( defined(PPC970) && defined(OS_DARWIN) )
 #define DCBT_ARG 0
 #else
 #define DCBT_ARG 8

From 5b95534afcc80d54f51bd766b617fd3f494ec65a Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 9 Mar 2019 11:21:16 +0100
Subject: [PATCH 498/935] Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1 for issue #2048

---
 kernel/Makefile.L3 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index eafcfb1b4..bf5fffe86 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
 USE_TRMM = 1
 endif
 
-ifeq ($(TARGET), GENERIC)
+ifeq ($(CORE), GENERIC)
 USE_TRMM = 1
 endif

From f074d7d1463c15bbf838b2305f259160281dead3 Mon Sep 17 00:00:00 2001
From: maomao194313
Date: Tue, 12 Mar 2019 16:05:19 +0800
Subject: [PATCH 499/935] make DYNAMIC_ARCH=1 package work on TSV110.
--- cpuid_arm64.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5077d7b11..a5e731d74 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -39,6 +39,8 @@ // Cavium #define CPU_THUNDERX 7 #define CPU_THUNDERX2T99 8 +//Hisilicon +#define CPU_TSV110 9 static char *cpuname[] = { "UNKNOWN", @@ -49,7 +51,8 @@ static char *cpuname[] = { "CORTEXA73", "FALKOR", "THUNDERX", - "THUNDERX2T99" + "THUNDERX2T99", + "TSV110" }; static char *cpuname_lower[] = { @@ -61,7 +64,8 @@ static char *cpuname_lower[] = { "cortexa73", "falkor", "thunderx", - "thunderx2t99" + "thunderx2t99", + "tsv110" }; int get_feature(char *search) @@ -145,6 +149,9 @@ int detect(void) return CPU_THUNDERX; else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; + // HiSilicon + else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) + return CPU_TSV110; } p = (char *) NULL ; @@ -286,6 +293,21 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; + + case CPU_TSV110: + printf("#define TSV110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 4 \n"); + printf("#define L1_DATA_SIZE 65536 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 4 \n"); + printf("#define L2_SIZE 524228 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; } } From 7e3eb9b25d26ca9be337acf0b0fd2c647e353e0c Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Tue, 12 Mar 2019 16:11:01 +0800 Subject: [PATCH 500/935] make DYNAMIC_ARCH=1 package work on TSV110 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3cc400b54..79fb05380 100644 --- a/param.h +++ b/param.h @@ -2591,7 +2591,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(CORTEXA53) || defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) + defined(FALKOR) || defined(TSV110) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From b1393c7a97e2da1b64e1f779bdf68b7af0924543 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Mar 2019 16:03:56 +0100 Subject: [PATCH 501/935] Add Intel Denverton for #2048 --- cpuid_x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index c45ddd968..884d4b78a 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1359,6 +1359,8 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: // Apollo Lake + case 15: + // Denverton return CPUTYPE_NEHALEM; } break; @@ -1376,9 +1378,9 @@ int get_cpuname(void){ } break; case 9: - case 8: + case 8: switch (model) { - case 14: // Kaby Lake + case 14: // Kaby Lake and refreshes if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) From 04f2226ea6edd95decf888b67bbdd4a8de530b54 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Mar 2019 16:09:55 +0100 Subject: [PATCH 502/935] Add Intel Denverton --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 99c9254ac..895bacb50 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } - //Apollo Lake - if (model == 12) { + //Apollo Lake or Denverton + if (model == 12 || model == 15) { return &gotoblas_NEHALEM; } return NULL; From c3e30b2bc2234dfafc9e674c8ab5723fabeb04c5 Mon Sep 17 00:00:00 2001 From: Sacha Date: Wed, 13 Mar 2019 23:21:54 +1000 Subject: [PATCH 503/935] Change 64-bit detection as explained in #2056 --- cmake/system_check.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 88bb081a6..f30a946b4 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -39,7 +39,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") - set(X86_64 1) + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + set(X86_64 1) + else() + set(X86 1) + endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") From 4fc17d0d754b7905667fb84a68cf37a0d28a93bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Mar 2019 19:20:23 +0100 Subject: [PATCH 504/935] Trivial typo fix as suggested in #2022 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 91f42e396..8f72c5a79 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -199,7 +199,7 @@ NO_AFFINITY = 1 # been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 -# If you need santy check by comparing reference BLAS. It'll be very +# If you need sanity check by comparing results to reference BLAS. It'll be very # slow (Not implemented yet). 
# SANITY_CHECK = 1 From e608d4f7fe1a2085b22af206d0c8c2cc128c1e9a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Mar 2019 22:10:28 +0100 Subject: [PATCH 505/935] Disable the AVX512 DGEMM kernel (again) Due to as yet unresolved errors seen in #1955 and #2029 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index acc6356d6..5d0a300b5 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,7 +7,7 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c +#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c DGEMMINCOPY = dgemm_ncopy_8_skylakex.c DGEMMITCOPY = dgemm_tcopy_8_skylakex.c From 1006ff8a7bc4ee77150d6f13483838c96789e3fc Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Fri, 15 Mar 2019 15:06:30 +0100 Subject: [PATCH 506/935] Use POSIX getenv on Cygwin The Windows-native GetEnvironmentVariable cannot be relied on, as Cygwin does not always copy environment variables set through Cygwin to the Windows environment block, particularly after fork(). --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 7fcd5e316..f239c3d78 100644 --- a/common.h +++ b/common.h @@ -439,7 +439,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 typedef char env_var_t[MAX_PATH]; #define readenv(p, n) 0 #else -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) typedef char env_var_t[MAX_PATH]; #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #else From 4ad694eda1ff79040778648d44cda5b8f774c38d Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Mon, 18 Mar 2019 20:32:48 +0100 Subject: [PATCH 507/935] Fix for #2063: The DllMain used in Cygwin did not run the thread memory pool cleanup upon THREAD_DETACH which is needed when compiled with USE_TLS=1. 
--- driver/others/memory.c | 11 +++++++++-- exports/dllinit.c | 24 +++++++++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ed407a858..ac8545f35 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1313,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) { free(map_address); } +#ifdef SMP +void blas_thread_memory_cleanup(void) { + blas_memory_cleanup((void*)get_memory_table()); +} +#endif + + void blas_shutdown(void){ #ifdef SMP BLASFUNC(blas_thread_shutdown)(); @@ -1322,7 +1329,7 @@ void blas_shutdown(void){ /* Only cleanup if we were built for threading and TLS was initialized */ if (local_storage_key) #endif - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1552,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser break; case DLL_THREAD_DETACH: #if defined(SMP) - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #endif break; case DLL_PROCESS_DETACH: diff --git a/exports/dllinit.c b/exports/dllinit.c index 02ff092e9..0e1bb34e3 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -40,15 +40,25 @@ void gotoblas_init(void); void gotoblas_quit(void); +#if defined(SMP) && defined(USE_TLS) +void blas_thread_memory_cleanup(void); +#endif BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { - - if (reason == DLL_PROCESS_ATTACH) { - gotoblas_init(); - } - - if (reason == DLL_PROCESS_DETACH) { - gotoblas_quit(); + switch(reason) { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: +#if defined(SMP) && defined(USE_TLS) + blas_thread_memory_cleanup(void); +#endif + break; } return TRUE; From 8ba9e2a61a1cf34e9b2efc5af61f5ebaaf6ab902 Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Tue, 19 Mar 2019 10:22:02 +0100 Subject: [PATCH 508/935] Also call CloseHandle on each thread, as well as on the event, so as not to leak thread handles.
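For context on the change below: handles returned by CreateThread (and CreateEvent) keep their kernel objects alive until CloseHandle is called, even after the thread has exited, so a shutdown path that only waits on its workers leaks one handle per thread. A minimal sketch of the pattern this patch restores, using a hypothetical join_and_close() helper rather than the library's actual shutdown routine:

#include <windows.h>

/* Hypothetical helper: wait for each worker thread to finish, then close its
 * handle, and finally close the shared event handle as well. */
static void join_and_close(HANDLE *threads, int nthreads, HANDLE event) {
  for (int i = 0; i < nthreads; i++) {
    WaitForSingleObject(threads[i], INFINITE); /* thread has exited here... */
    CloseHandle(threads[i]);                   /* ...but its handle still needs closing */
  }
  CloseHandle(event); /* event handles leak the same way if left open */
}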
--- driver/others/blas_server_win32.c | 5 +++++ exports/dllinit.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index bae344c59..0b38ee365 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ + // Could also just use WaitForMultipleObjects WaitForSingleObject(blas_threads[i], 5); //INFINITE); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP TerminateThread(blas_threads[i],0); #endif + CloseHandle(blas_threads[i]); } + CloseHandle(pool.filled); + CloseHandle(pool.killed); + blas_server_avail = 0; } diff --git a/exports/dllinit.c b/exports/dllinit.c index 0e1bb34e3..4a05c0e14 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -56,7 +56,7 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { break; case DLL_THREAD_DETACH: #if defined(SMP) && defined(USE_TLS) - blas_thread_memory_cleanup(void); + blas_thread_memory_cleanup(); #endif break; } From b043a5962e3785c9879f671fca8e7226dc70ff4f Mon Sep 17 00:00:00 2001 From: Ayappan P Date: Mon, 25 Mar 2019 18:53:25 +0530 Subject: [PATCH 509/935] AIX asm syntax changes needed for shared object creation --- common_power.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/common_power.h b/common_power.h index 68087b071..60de48a63 100644 --- a/common_power.h +++ b/common_power.h @@ -598,9 +598,14 @@ REALNAME:;\ #ifndef __64BIT__ #define PROLOGUE \ .machine "any";\ + .toc;\ .globl .REALNAME;\ + .globl REALNAME;\ + .csect REALNAME[DS],3;\ +REALNAME:;\ + .long .REALNAME, TOC[tc0], 0;\ .csect .text[PR],5;\ -.REALNAME:; +.REALNAME: #define EPILOGUE \ _section_.text:;\ @@ -611,9 +616,14 @@ _section_.text:;\ #define PROLOGUE \ .machine "any";\ + .toc;\ .globl .REALNAME;\ + .globl REALNAME;\ + .csect REALNAME[DS],3;\ +REALNAME:;\ + .llong .REALNAME, TOC[tc0], 0;\ .csect .text[PR], 5;\ -.REALNAME:; +.REALNAME: #define EPILOGUE \ _section_.text:;\ From 853a18bc17628fb1e8615503304ceedef9d45030 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Thu, 14 Mar 2019 10:42:04 +0000 Subject: [PATCH 510/935] POWER9 Makefile. DGEMM based on the POWER8 kernel, with the following changes: 32x unrolled 16x4 kernel and an 8x4 kernel using lxv/stxv butterfly rank-1 updates; improvement from 17 to 22-23 GFLOPS;
dtrmm cases were added into dgemm itself --- Makefile.power | 10 +- TargetList.txt | 1 + common.h | 5 + common_power.h | 8 +- cpuid_power.c | 8 +- getarch.c | 12 + kernel/Makefile.L3 | 4 + kernel/power/KERNEL.POWER9 | 184 ++ kernel/power/casum.c | 2 +- kernel/power/ccopy.c | 2 +- kernel/power/crot.c | 2 +- kernel/power/cswap.c | 2 +- kernel/power/dasum.c | 2 +- kernel/power/daxpy.c | 2 +- kernel/power/dcopy.c | 2 +- kernel/power/ddot.c | 2 +- kernel/power/dgemm_kernel_power9.S | 249 ++ kernel/power/dgemm_logic_power9.S | 1981 +++++++++++++++ kernel/power/dgemm_macros_power9.S | 3623 ++++++++++++++++++++++++++++ kernel/power/dgemv_n.c | 2 +- kernel/power/drot.c | 2 +- kernel/power/dscal.c | 2 +- kernel/power/dswap.c | 2 +- kernel/power/sasum.c | 2 +- kernel/power/scopy.c | 2 +- kernel/power/sdot.c | 2 +- kernel/power/srot.c | 2 +- kernel/power/sscal.c | 2 +- kernel/power/sswap.c | 2 +- kernel/power/zasum.c | 2 +- kernel/power/zaxpy.c | 7 +- kernel/power/zcopy.c | 2 +- kernel/power/zdot.c | 2 +- kernel/power/zscal.c | 2 +- kernel/power/zswap.c | 2 +- param.h | 31 + 36 files changed, 6133 insertions(+), 36 deletions(-) create mode 100644 kernel/power/KERNEL.POWER9 create mode 100644 kernel/power/dgemm_kernel_power9.S create mode 100644 kernel/power/dgemm_logic_power9.S create mode 100644 kernel/power/dgemm_macros_power9.S diff --git a/Makefile.power b/Makefile.power index a49372ad7..195f1930f 100644 --- a/Makefile.power +++ b/Makefile.power @@ -9,7 +9,15 @@ else USE_OPENMP = 1 endif - +ifeq ($(CORE), POWER9) +ifeq ($(USE_OPENMP), 1) +COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +endif +endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) diff --git a/TargetList.txt b/TargetList.txt index 3d04a57cf..44e539c09 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -48,6 +48,7 @@ POWER5 POWER6 POWER7 POWER8 +POWER9 PPCG4 PPC970 PPC970MP diff --git a/common.h b/common.h index 7fcd5e316..b30a71ff1 100644 --- a/common.h +++ b/common.h @@ -348,6 +348,11 @@ typedef int blasint; #endif #endif +#ifdef POWER9 +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif /* #ifdef PILEDRIVER diff --git a/common_power.h b/common_power.h index e3a1a7aef..ddbee9412 100644 --- a/common_power.h +++ b/common_power.h @@ -39,7 +39,7 @@ #ifndef COMMON_POWER #define COMMON_POWER -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #else @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -802,7 +802,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) 
#elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) +#elif defined(POWER8) || defined(POWER9) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) diff --git a/cpuid_power.c b/cpuid_power.c index 82a3f4aac..d5ba6fb2c 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -94,7 +94,7 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER8" + "POWER9" }; int detect(void){ @@ -124,7 +124,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -156,7 +156,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; @@ -180,7 +180,7 @@ int id; __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 - return CPUTYPE_POWER8; + return CPUTYPE_POWER9; break; case 0x4d: case 0x4b: // POWER8/8E diff --git a/getarch.c b/getarch.c index 78ba0fefd..34d46905a 100644 --- a/getarch.c +++ b/getarch.c @@ -618,6 +618,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER8" #endif +#if defined(FORCE_POWER9) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER9" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER9 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power9" +#define CORENAME "POWER9" +#endif #ifdef FORCE_PPCG4 #define FORCE diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 9258f216d..db9fccd30 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -44,6 +44,10 @@ ifeq ($(CORE), POWER8) USE_TRMM = 1 endif +ifeq ($(CORE), POWER9) +USE_TRMM = 1 +endif + ifeq ($(ARCH), zarch) USE_TRMM = 1 endif diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 new file mode 100644 index 000000000..86a931971 --- /dev/null +++ b/kernel/power/KERNEL.POWER9 @@ -0,0 +1,184 @@ +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = strmm_kernel_16x8_power8.S +DTRMMKERNEL = dgemm_kernel_power9.S +CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S + +SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o 
+DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = cgemm_tcopy_8_power8.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o + +ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. +#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c +# +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c +# +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c +# +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c +# +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +DSDOTKERNEL = sdot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +# +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c +# +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c +# +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c +# +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c +# + +SGEMVNKERNEL = sgemv_n.c +DGEMVNKERNEL = dgemv_n.c +CGEMVNKERNEL = cgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c +# +SGEMVTKERNEL = sgemv_t.c +DGEMVTKERNEL = dgemv_t.c +CGEMVTKERNEL = cgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c + + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c +#QSYMV_U_KERNEL = 
../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/power/casum.c b/kernel/power/casum.c index d1108581d..a9ece0768 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "casum_microk_power8.c" #endif diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index ce7d67475..50df84cc5 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "ccopy_microk_power8.c" #endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 40e350ba3..959a9eda0 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index da97c896e..31e02fe5a 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 73962c2f2..d0e060977 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" #endif diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index df0572e8e..f09611ff0 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "daxpy_microk_power8.c" #endif diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index 059c0e5a9..27b39144b 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dcopy_microk_power8.c" #endif diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index e43470e23..f985df1c5 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "ddot_microk_power8.c" #endif diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S new file mode 100644 index 000000000..a1762dcf2 --- /dev/null +++ b/kernel/power/dgemm_kernel_power9.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S new file mode 100644 index 000000000..251839d19 --- /dev/null +++ b/kernel/power/dgemm_logic_power9.S @@ -0,0 +1,1981 @@ +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS 
Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. 
L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. 
T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S new file mode 100644 index 000000000..c4b8270b8 --- /dev/null +++ b/kernel/power/dgemm_macros_power9.S @@ -0,0 +1,3623 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + 
KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + 
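+	/* Not the final pass: start preloading the next iteration's
+	   A vectors while the second-stage FMAs below still consume
+	   vs8-vs15. */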
lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, 
vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r 
+ xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 ,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 
+\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + 
xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, 
vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, 
vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx 
vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 
+ xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + 
xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + 
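+	/* Store the first updated row of the 2x8 block back to C. */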
+ stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + 
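+	/* First K step: initialize the 2x2 accumulators (vs32, vs40)
+	   from the previously loaded vs0 and vs24/vs25. */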
xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, 
o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + 
xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro 
KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + 
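+	/* Write the single updated element back to C. */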
+	stxsdx	vs0, 0, T1
+
+	addi	CO, CO, 8
+
+.endm
+
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+.macro SHIFT_REG  REG1,REG2,SHIFT_VAL
+	.if \SHIFT_VAL==16
+		slwi	\REG1, \REG2, 7
+	.elseif \SHIFT_VAL==8
+		slwi	\REG1, \REG2, 6
+	.elseif \SHIFT_VAL==4
+		slwi	\REG1, \REG2, 5
+	.elseif \SHIFT_VAL==2
+		slwi	\REG1, \REG2, 4
+	.elseif \SHIFT_VAL==1
+		slwi	\REG1, \REG2, 3
+	.endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+//		ptrbb = bb;
+// #else
+//		ptrba += off*16;
+//		ptrbb = bb + off*2;
+// #endif
+*/
+.macro REFRESH_POINTERS  PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+	#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		/* ptrbb = bb; */
+		mr	\PTR_B, \B_VAL	/* refresh BPOINT */
+
+	#else
+		/*
+		// ptrba = ptrba + off*C_A;
+		// ptrbb = bb + off*C_B;
+		*/
+		SHIFT_REG	T4, \OFF_VAL, \C_B	/* number of values in B shifted */
+		SHIFT_REG	T2, \OFF_VAL, \C_A	/* number of values in A shifted */
+		add	\PTR_B, \B_VAL, T4	/* add values to BO */
+		add	\PTR_A, \PTR_A, T2	/* add values to AO */
+	#endif
+.endm
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+//	temp = bk-off;
+// #elif defined(LEFT)
+//	temp = off+16;	// number of values in A
+// #else
+//	temp = off+2;	// number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK  TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+	#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		/* temp = bk-off; */
+		sub	\TEMP_BK, \BK_VAL, \OFF_VAL
+
+	#elif defined(LEFT)
+		/* temp = off+INCR_A;	// number of values in A */
+		addi	\TEMP_BK, \OFF_VAL, \INCR_A
+	#else
+		/* temp = off+INCR_B;	// number of values in B */
+		addi	\TEMP_BK, \OFF_VAL, \INCR_B
+	#endif
+
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+//	temp = bk - off;
+// #ifdef LEFT
+//	temp -= 16; // number of values in A
+// #else
+//	temp -= 2; // number of values in B
+// #endif
+//	ptrba += temp*16;
+//	ptrbb += temp*2;
+// #endif
+
+// #ifdef LEFT
+//	off += 16; // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE  TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+
+	#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		/* temp = bk - off; */
+		sub	\TEMP_BK, \BK_VAL, \OFF_VAL
+	#ifdef LEFT
+		/* temp -= C_A; // number of values in A */
+		addi	\TEMP_BK, \TEMP_BK, -\C_A
+	#else
+		/* temp -= C_B; // number of values in B */
+		addi	\TEMP_BK, \TEMP_BK, -\C_B
+	#endif
+		/* ptrba += temp*C_A;
+		   ptrbb += temp*C_B; */
+		SHIFT_REG	T4, \TEMP_BK, \C_A
+		SHIFT_REG	T2, \TEMP_BK, \C_B
+		add	\PTR_A, \PTR_A, T4	/* ptrba + temp*C_A */
+		add	\PTR_B, \PTR_B, T2
+
+	#endif
+
+	#ifdef LEFT
+		/* off += C_A; // number of values in A */
+		addi	\OFF_VAL, \OFF_VAL, \C_A
+	#endif
+.endm
\ No newline at end of file
diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c
index 57f9f9e72..b458e11fc 100644
--- a/kernel/power/dgemv_n.c
+++ b/kernel/power/dgemv_n.c
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if defined(POWER8)
+#if defined(POWER8) || defined(POWER9)
 #include "dgemv_n_microk_power8.c"
 #endif
 
diff --git a/kernel/power/drot.c b/kernel/power/drot.c
index 3e107486f..baeb54205 100644
--- a/kernel/power/drot.c
+++ b/kernel/power/drot.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
#pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" #endif diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index f32dc4bad..779a08e9c 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" #endif diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index fd2dec9c4..52b7f50da 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" #endif diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index fb10b1d27..5908347d3 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" #endif diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c index 167c29bab..5e3fe45a5 100644 --- a/kernel/power/scopy.c +++ b/kernel/power/scopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "scopy_microk_power8.c" #endif diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c index 4fdc2f5b5..ae527dde9 100644 --- a/kernel/power/sdot.c +++ b/kernel/power/sdot.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sdot_microk_power8.c" #endif diff --git a/kernel/power/srot.c b/kernel/power/srot.c index d2910ff87..6af813c16 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" #endif diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index bd5cdc43f..4f3ba5698 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" #endif diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 932652b37..23d13280f 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" #endif diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index 0b6b87d46..f61c62e75 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zasum_microk_power8.c" #endif diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index dd7ab6c3c..f0f8c6910 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,19 +36,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
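The zaxpy hunk below replaces the pointer-based alpha argument of the fallback
kernel with the real and imaginary components passed by value. Roughly, a call
site changes as sketched here (illustrative, not part of the diff):

    /* before: FLOAT alpha[2] = { da_r, da_i };
     *         zaxpy_kernel_4(n1, x, y, alpha);   */
    zaxpy_kernel_4(n1, x, y, da_r, da_i);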
#include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zaxpy_microk_power8.c" #endif #ifndef HAVE_KERNEL_4 -static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { BLASLONG register i = 0; BLASLONG register ix = 0; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; + while(i < n) diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index a7658f7ab..b21d6ef15 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zcopy_microk_power8.c" #endif diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index b83f832b1..fd36c7f44 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zdot_microk_power8.c" #endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 14d677f24..a1b441d2c 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 5ec1eee2e..1d8826f41 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" #endif diff --git a/param.h b/param.h index fa6730208..938a82a9e 100644 --- a/param.h +++ b/param.h @@ -2230,6 +2230,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#if defined(POWER9) + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 65536 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 1280 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 640 +#define ZGEMM_DEFAULT_P 320 + +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 640 +#define ZGEMM_DEFAULT_Q 640 + +#define SYMV_P 8 + +#endif #if defined(SPARC) && defined(V7) From 4f9d3e4b28e9a5dfbe70e0a4f4f54517e5b3d6ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 12:37:13 +0100 Subject: [PATCH 511/935] Expose CBLAS interfaces for I?MIN and I?MAX --- cblas.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cblas.h b/cblas.h index d340a2037..e3dacb737 100644 --- a/cblas.h +++ b/cblas.h @@ -88,6 +88,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); From 3d1e36d4cb15eb94098d2ab0a3413413c7aec2c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 12:38:41 +0100 Subject: [PATCH 512/935] Build CBLAS interfaces for I?MIN and I?MAX --- interface/Makefile | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 20ec74e9e..2b996c7de 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -263,7 +263,8 @@ CSBLAS1OBJS = \ cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ - cblas_sscal.$(SUFFIX) 
cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) + cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -280,7 +281,8 @@ CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ - cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) + cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -300,7 +302,8 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) + cblas_caxpby.$(SUFFIX) \ + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -326,7 +329,9 @@ CZBLAS1OBJS = \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) + cblas_zaxpby.$(SUFFIX) \ + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) + CZBLAS2OBJS = \ cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ @@ -1383,6 +1388,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) From 5c42287c4fa88e295a5c0bc9b58e3915148408be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 21:58:03 +0100 Subject: [PATCH 513/935] Add declarations for ?sum and cblas_?sum --- cblas.h | 5 +++++ common_c.h | 2 ++ common_d.h | 2 ++ common_interface.h | 7 +++++++ common_level1.h | 7 +++++++ common_macro.h | 6 ++++++ common_param.h | 6 ++++++ common_s.h | 2 ++ common_z.h | 2 ++ 9 files changed, 39 insertions(+) diff --git a/cblas.h b/cblas.h index e3dacb737..1a87074d6 100644 --- a/cblas.h +++ b/cblas.h @@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double 
cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); diff --git a/common_c.h b/common_c.h index ce0f2a5bd..40ecf5b8b 100644 --- a/common_c.h +++ b/common_c.h @@ -19,6 +19,7 @@ #define CDOTC_K cdotc_k #define CNRM2_K cnrm2_k #define CSCAL_K cscal_k +#define CSUM_K csum_k #define CSWAP_K cswap_k #define CROT_K csrot_k @@ -249,6 +250,7 @@ #define CDOTC_K gotoblas -> cdotc_k #define CNRM2_K gotoblas -> cnrm2_k #define CSCAL_K gotoblas -> cscal_k +#define CSUM_K gotoblas -> csum_k #define CSWAP_K gotoblas -> cswap_k #define CROT_K gotoblas -> csrot_k diff --git a/common_d.h b/common_d.h index ad9945186..94dc3eea8 100644 --- a/common_d.h +++ b/common_d.h @@ -19,6 +19,7 @@ #define DDOTC_K ddot_k #define DNRM2_K dnrm2_k #define DSCAL_K dscal_k +#define DSUM_K dsum_k #define DSWAP_K dswap_k #define DROT_K drot_k @@ -174,6 +175,7 @@ #define DDOTC_K gotoblas -> ddot_k #define DNRM2_K gotoblas -> dnrm2_k #define DSCAL_K gotoblas -> dscal_k +#define DSUM_K gotoblas -> dsum_k #define DSWAP_K gotoblas -> dswap_k #define DROT_K gotoblas -> drot_k diff --git a/common_interface.h b/common_interface.h index 15f69e02f..c350ac8ec 100644 --- a/common_interface.h +++ b/common_interface.h @@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); double BLASFUNC(dzasum)(blasint *, double *, blasint *); xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); +FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); +double BLASFUNC(dsum) (blasint *, double *, blasint *); +xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); +double BLASFUNC(dzsum)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); + blasint BLASFUNC(isamax)(blasint *, float *, blasint *); blasint BLASFUNC(idamax)(blasint *, double *, blasint *); blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); diff --git a/common_level1.h b/common_level1.h index 32ffd6f18..74cafb6db 100644 --- a/common_level1.h +++ b/common_level1.h @@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG); double zasum_k (BLASLONG, double *, BLASLONG); xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); +float ssum_k (BLASLONG, float *, BLASLONG); +double dsum_k (BLASLONG, double *, BLASLONG); +xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); +float csum_k (BLASLONG, float *, BLASLONG); +double zsum_k (BLASLONG, double *, BLASLONG); +xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); + float samax_k (BLASLONG, float *, BLASLONG); double damax_k (BLASLONG, double *, BLASLONG); xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 15ba6f9db..d2503aa65 100644 --- a/common_macro.h +++ b/common_macro.h @@ -66,6 +66,7 @@ #define DOTC_K QDOTC_K #define NRM2_K QNRM2_K #define SCAL_K QSCAL_K +#define SUM_K QSUM_K #define SWAP_K QSWAP_K #define ROT_K QROT_K @@ -356,6 +357,7 @@ #define DOTC_K DDOTC_K #define NRM2_K DNRM2_K #define SCAL_K DSCAL_K +#define SUM_K DSUM_K #define SWAP_K DSWAP_K #define ROT_K DROT_K @@ -658,6 +660,7 @@ #define DOTC_K SDOTC_K #define NRM2_K SNRM2_K #define SCAL_K SSCAL_K +#define SUM_K SSUM_K #define SWAP_K SSWAP_K #define ROT_K SROT_K @@ -962,6 
+965,7 @@ #define DOTC_K XDOTC_K #define NRM2_K XNRM2_K #define SCAL_K XSCAL_K +#define SUM_K XSUM_K #define SWAP_K XSWAP_K #define ROT_K XROT_K @@ -1363,6 +1367,7 @@ #define DOTC_K ZDOTC_K #define NRM2_K ZNRM2_K #define SCAL_K ZSCAL_K +#define SUM_K ZSUM_K #define SWAP_K ZSWAP_K #define ROT_K ZROT_K @@ -1785,6 +1790,7 @@ #define DOTC_K CDOTC_K #define NRM2_K CNRM2_K #define SCAL_K CSCAL_K +#define SUM_K CSUM_K #define SWAP_K CSWAP_K #define ROT_K CROT_K diff --git a/common_param.h b/common_param.h index 8f162c01f..574d5e176 100644 --- a/common_param.h +++ b/common_param.h @@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); + float (*ssum_k) (BLASLONG, float *, BLASLONG); int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); + double (*dsum_k) (BLASLONG, double *, BLASLONG); int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); @@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); @@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); + float (*csum_k) (BLASLONG, float *, BLASLONG); int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); double (*znrm2_k) (BLASLONG, double *, BLASLONG); double (*zasum_k) (BLASLONG, double *, BLASLONG); + double (*zsum_k) (BLASLONG, double *, BLASLONG); int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); diff --git a/common_s.h b/common_s.h index 3c1600859..23c432f7c 100644 --- a/common_s.h +++ b/common_s.h @@ -12,6 +12,7 @@ #define ISMAX_K ismax_k #define ISMIN_K ismin_k #define SASUM_K sasum_k +#define SSUM_K ssum_k #define SAXPYU_K saxpy_k #define SAXPYC_K saxpy_k 
#define SCOPY_K scopy_k @@ -170,6 +171,7 @@ #define ISMAX_K gotoblas -> ismax_k #define ISMIN_K gotoblas -> ismin_k #define SASUM_K gotoblas -> sasum_k +#define SSUM_K gotoblas -> ssum_k #define SAXPYU_K gotoblas -> saxpy_k #define SAXPYC_K gotoblas -> saxpy_k #define SCOPY_K gotoblas -> scopy_k diff --git a/common_z.h b/common_z.h index b4f58bb0c..f1e78dd08 100644 --- a/common_z.h +++ b/common_z.h @@ -19,6 +19,7 @@ #define ZDOTC_K zdotc_k #define ZNRM2_K znrm2_k #define ZSCAL_K zscal_k +#define ZSUM_K zsum_k #define ZSWAP_K zswap_k #define ZROT_K zdrot_k @@ -249,6 +250,7 @@ #define ZDOTC_K gotoblas -> zdotc_k #define ZNRM2_K gotoblas -> znrm2_k #define ZSCAL_K gotoblas -> zscal_k +#define ZSUM_K gotoblas -> zsum_k #define ZSWAP_K gotoblas -> zswap_k #define ZROT_K gotoblas -> zdrot_k From 79cfc24a6208b869ec79ee26d3f3eab6af3b8aea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 21:59:18 +0100 Subject: [PATCH 514/935] Add interface for ?sum (derived from ?asum) --- interface/CMakeLists.txt | 3 ++ interface/Makefile | 56 +++++++++++++++++------ interface/sum.c | 97 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 13 deletions(-) create mode 100644 interface/sum.c diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 8b25344c0..f76d5c13f 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES rotm.c rotmg.c # N.B. these do not have complex counterparts rot.c asum.c + sum.c ) # these will have 'z' prepended for the complex version @@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") endif () if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") @@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") endif () endforeach () diff --git a/interface/Makefile b/interface/Makefile index 2b996c7de..f0577796d 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -25,7 +25,7 @@ SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ scopy.$(SUFFIX) sscal.$(SUFFIX) \ sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ - sasum.$(SUFFIX) snrm2.$(SUFFIX) \ + sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ @@ -51,7 +51,7 @@ DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ dcopy.$(SUFFIX) dscal.$(SUFFIX) \ ddot.$(SUFFIX) \ - dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ + dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ @@ -76,7 +76,7 @@ CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) 
cswap.$(SUFFIX) \ ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ - scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ + scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ scamax.$(SUFFIX) icamax.$(SUFFIX) \ scamin.$(SUFFIX) icamin.$(SUFFIX) \ csrot.$(SUFFIX) crotg.$(SUFFIX) \ @@ -105,7 +105,7 @@ ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ - dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ + dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ dzamax.$(SUFFIX) izamax.$(SUFFIX) \ dzamin.$(SUFFIX) izamin.$(SUFFIX) \ zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ @@ -146,7 +146,7 @@ QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ qdot.$(SUFFIX) \ - qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ @@ -168,7 +168,7 @@ XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ - qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ @@ -203,7 +203,7 @@ ifdef QUAD_PRECISION QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ - qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ @@ -224,7 +224,7 @@ QBLAS3OBJS = \ XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ - qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ @@ -264,7 +264,7 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -282,7 +282,7 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -303,7 +303,7 @@ CCBLAS1OBJS = \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) 
cblas_cgeru.$(SUFFIX) \ @@ -330,7 +330,7 @@ CZBLAS1OBJS = \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) CZBLAS2OBJS = \ @@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) +ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1419,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) diff --git a/interface/sum.c b/interface/sum.c new file mode 100644 index 000000000..dfdcc5dcc --- /dev/null +++ b/interface/sum.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#ifndef CBLAS
+
+FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
+
+	BLASLONG n = *N;
+	BLASLONG incx = *INCX;
+	FLOATRET ret;
+
+	PRINT_DEBUG_NAME;
+
+	if (n <= 0) return 0;
+
+	IDEBUG_START;
+
+	FUNCTION_PROFILE_START();
+
+	ret = (FLOATRET)SUM_K(n, x, incx);
+
+	FUNCTION_PROFILE_END(COMPSIZE, n, n);
+
+	IDEBUG_END;
+
+	return ret;
+}
+
+#else
+#ifdef COMPLEX
+FLOAT CNAME(blasint n, void *vx, blasint incx){
+	FLOAT *x = (FLOAT*) vx;
+#else
+FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
+#endif
+
+	FLOAT ret;
+
+	PRINT_DEBUG_CNAME;
+
+	if (n <= 0) return 0;
+
+	IDEBUG_START;
+
+	FUNCTION_PROFILE_START();
+
+	ret = SUM_K(n, x, incx);
+
+	FUNCTION_PROFILE_END(COMPSIZE, n, n);
+
+	IDEBUG_END;
+
+	return ret;
+}
+
+#endif

From b9f4943a14ef8ff4a1bde192f491b2efa02eff40 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 30 Mar 2019 22:01:13 +0100
Subject: [PATCH 515/935] Add ?sum

---
 kernel/CMakeLists.txt |  1 +
 kernel/Makefile.L1    | 61 ++++++++++++++++++++++++++++++++++++++-----
 kernel/setparam-ref.c | 12 ++++-----
 3 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index 2a330df4e..ad15b8f25 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
     GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
     GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
     GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
+    GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type})
 
     if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
       GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})
diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1
index a8f9cf097..970703230 100644
--- a/kernel/Makefile.L1
+++ b/kernel/Makefile.L1
@@ -340,6 +340,32 @@ ifndef XSCALKERNEL
 XSCALKERNEL = zscal.S
 endif
 
+### SUM ###
+
+ifndef SSUMKERNEL
+SSUMKERNEL = sum.S
+endif
+
+ifndef DSUMKERNEL
+DSUMKERNEL = sum.S
+endif
+
+ifndef CSUMKERNEL
+CSUMKERNEL = zsum.S
+endif
+
+ifndef ZSUMKERNEL
+ZSUMKERNEL = zsum.S
+endif
+
+ifndef QSUMKERNEL
+QSUMKERNEL = sum.S
+endif
+
+ifndef XSUMKERNEL
+XSUMKERNEL = zsum.S
+endif
+
 ### SWAP ###
 
 ifndef SSWAPKERNEL
@@ -453,7 +479,7 @@ endif
 
 SBLASOBJS += \
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ - sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ + sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ saxpby_k$(TSUFFIX).$(SUFFIX) @@ -463,31 +489,32 @@ DBLASOBJS += \ idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ - daxpby_k$(TSUFFIX).$(SUFFIX) + daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ - qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) + qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ + qsum_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ - cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) + cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ - zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) + zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) XBLASOBJS += \ xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ - xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) + xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) ### AMAX ### @@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : 
$(KERNELDIR)/$(IQMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ - +### ASUM ### $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ @@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ +### SUM ### +$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ + +### AXPY ### $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 6d4028b0b..2985003f3 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = { samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, @@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = { damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, - dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, + dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS, drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, dgemv_nTS, dgemv_tTS, dger_kTS, dsymv_LTS, dsymv_UTS, @@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = { qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, - qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, + qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, @@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = { #endif camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, - cnrm2_kTS, casum_kTS, ccopy_kTS, + cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS, cdotu_kTS, cdotc_kTS, csrot_kTS, caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, @@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = { #endif zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, - znrm2_kTS, zasum_kTS, zcopy_kTS, + znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS, zdotu_kTS, zdotc_kTS, zdrot_kTS, zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, @@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = { XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, - xnrm2_kTS, xasum_kTS, xcopy_kTS, + xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS, xdotu_kTS, xdotc_kTS, xqrot_kTS, xaxpy_kTS, 
 xaxpyc_kTS, xscal_kTS, xswap_kTS,

From c3cfc6986b9b2b38af7324591dd4a54c21a093a7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 30 Mar 2019 22:05:11 +0100
Subject: [PATCH 516/935] Add implementations of ssum/dsum and csum/zsum as
 trivial copies of asum/zasum with the fabs calls replaced by fmov to
 preserve code structure

---
 kernel/alpha/sum.S  | 206 +++++++++++++++++++++++++++++++++++++++
 kernel/alpha/zsum.S | 208 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 414 insertions(+)
 create mode 100644 kernel/alpha/sum.S
 create mode 100644 kernel/alpha/zsum.S

diff --git a/kernel/alpha/sum.S b/kernel/alpha/sum.S
new file mode 100644
index 000000000..3902817a7
--- /dev/null
+++ b/kernel/alpha/sum.S
@@ -0,0 +1,206 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * 2 * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a7, 0 * SIZE(X) + fmov a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fmov a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, s0 + LD a1, 0 * SIZE(X) + fmov a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fmov a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a3, 0 * SIZE(X) + fmov a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fmov a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fmov a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a7, 0 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fmov a2, t2 + ADD s3, t3, s3 + fmov a3, t3 + + ADD s0, t0, s0 + fmov a4, t0 + ADD s1, t1, s1 + fmov a5, t1 + ADD s2, t2, s2 + fmov a6, t2 + ADD s3, t3, s3 + fmov a7, t3 + + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s1, s0 + ADD s2, s3, s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2, s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fmov a0, t0 + + lda I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ret + EPILOGUE diff --git a/kernel/alpha/zsum.S b/kernel/alpha/zsum.S new file mode 100644 index 000000000..1ad0eb137 --- /dev/null +++ b/kernel/alpha/zsum.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + addq INCX, INCX, INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + lda I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fmov a1, t1 + unop + + ADD s2, t2, s2 + LD a7, 1 * SIZE(X) + fmov a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fmov a3, t3 + unop + + ADD s0, t0, s0 + LD a1, 1 * SIZE(X) + fmov a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fmov a5, t1 + unop + + ADD s2, t2, s2 + LD a3, 1 * SIZE(X) + fmov a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fmov a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fmov a0, t0 + + ADD s1, t1, s1 + LD a7, 1 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fmov a2, t2 + ADD s3, t3, s3 + fmov a3, t3 + + ADD s0, t0, s0 + fmov a4, t0 + ADD s1, t1, s1 + fmov a5, t1 + ADD s2, t2, s2 + fmov a6, t2 + ADD s3, t3, s3 + fmov a7, t3 + + ADD s2, t2, s2 + ADD s3, t3, s3 + + .align 4 + +$L15: + ADD s0, s2, s0 + and N, 3, I + ADD s1, s3, s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a1, 1 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ADD s1, t1, s1 + + ADD s0, s1, s0 + ret + EPILOGUE From 94ab4e6fb262a03752cb1a54a5731cb8b0b2dc43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:11:38 +0100 Subject: [PATCH 517/935] Add ARM implementations of ?sum (trivial copies of the respective ?asum with the fabs calls removed) --- kernel/arm/KERNEL.ARMV5 | 5 + kernel/arm/KERNEL.ARMV6 | 3 + kernel/arm/sum.c | 51 +++++ kernel/arm/sum_vfp.S | 425 ++++++++++++++++++++++++++++++++++++++++ 
kernel/arm/zsum.c | 57 ++++++ 5 files changed, 541 insertions(+) create mode 100644 kernel/arm/sum.c create mode 100644 kernel/arm/sum_vfp.S create mode 100644 kernel/arm/zsum.c diff --git a/kernel/arm/KERNEL.ARMV5 b/kernel/arm/KERNEL.ARMV5 index 10808e2d9..e977dda3a 100644 --- a/kernel/arm/KERNEL.ARMV5 +++ b/kernel/arm/KERNEL.ARMV5 @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 960dae67b..b773a5ba0 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S +SSUMKERNEL = sum_vfp.S +DSUMKERNEL = sum_vfp.S + SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c new file mode 100644 index 000000000..7b78ec61a --- /dev/null +++ b/kernel/arm/sum.c @@ -0,0 +1,51 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* trivial copy of asum.c with the ABS() removed                                      *
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0;
+	FLOAT sumf = 0.0;
+	if (n <= 0 || inc_x <= 0) return(sumf);
+
+	n *= inc_x;
+	while(i < n)
+	{
+		sumf += x[i];
+		i += inc_x;
+	}
+	return(sumf);
+}
+
+
diff --git a/kernel/arm/sum_vfp.S b/kernel/arm/sum_vfp.S
new file mode 100644
index 000000000..d33d99ed3
--- /dev/null
+++ b/kernel/arm/sum_vfp.S
@@ -0,0 +1,425 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + + +.endm + + +.macro KERNEL_S4 + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + 
vadd.f32 s0 , s0, s4 + + vldmia.f32 X!, { s4 } + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + movs r12, #0 // clear floating point register + vmov s0, r12 + vmov s1, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 + vcvt.f64.f32 d1, s1 +#endif + + cmp N, #0 + ble asum_kernel_L999 + + cmp INC_X, #0 + beq asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + + +asum_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_F1 + + .align 5 + +asum_kernel_F4: + +#if !defined(DOUBLE) && !defined(COMPLEX) + pld [ X, #X_PRE ] +#endif + KERNEL_F4 + + subs I, I, #1 + ble asum_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne asum_kernel_F4 + +asum_kernel_F1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + + b asum_kernel_L999 + +asum_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_S1 + + .align 5 + +asum_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + +asum_kernel_L999: + + +#if defined(DOUBLE) + vadd.f64 d0 , d0, d1 // set return value +#else + vadd.f32 s0 , s0, s1 // set return value +#endif + +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif +#endif + + bx lr + + EPILOGUE + diff --git a/kernel/arm/zsum.c b/kernel/arm/zsum.c new file mode 100644 index 000000000..cd24f9995 --- /dev/null +++ b/kernel/arm/zsum.c @@ -0,0 +1,57 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* trivial copy of zasum.c with the ABS() removed                                     *
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+#define CSUM1(x,i) x[i]+x[i+1]
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    FLOAT sumf = 0.0;
+    BLASLONG inc_x2;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    inc_x2 = 2 * inc_x;
+
+    n *= inc_x2;
+    while(i < n)
+    {
+        sumf += CSUM1(x,i);
+        i += inc_x2;
+    }
+    return(sumf);
+}
+
+
From 3e3ccb90118e58a92e59a5e7e3cf3209b3d925f4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 30 Mar 2019 22:13:36 +0100
Subject: [PATCH 518/935] Add ARM64 implementations of ?sum as trivial copies
 of the respective ?asum kernels with the fabs calls removed

---
 kernel/arm64/csum.S | 164 ++++++++++++++++++++++++++++++++++++++
 kernel/arm64/sum.S  | 186 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/arm64/zsum.S | 158 +++++++++++++++++++++++++++++++++++++
 3 files changed, 508 insertions(+)
 create mode 100644 kernel/arm64/csum.S
 create mode 100644 kernel/arm64/sum.S
 create mode 100644 kernel/arm64/zsum.S

diff --git a/kernel/arm64/csum.S b/kernel/arm64/csum.S
new file mode 100644
index 000000000..90746bc39
--- /dev/null
+++ b/kernel/arm64/csum.S
@@ -0,0 +1,164 @@
+/*******************************************************************************
+Copyright (c) 2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2s}, [X], #8 + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 + ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] + add X, X, #64 + + PRFM PLDL1KEEP, [X, #1024] + + fadd v1.4s, v1.4s, v2.4s + fadd v3.4s, v3.4s, v4.4s + fadd v0.4s, v0.4s, v1.4s + fadd v0.4s, v0.4s, v3.4s +.endm + +.macro KERNEL_F8_FINALIZE + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +.endm + +.macro INIT_S + lsl INC_X, INC_X, #3 +.endm + +.macro KERNEL_S1 + ld1 {v1.2s}, [X], INC_X + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + fmov s1, SUMF + + cmp N, xzr + ble .Lcsum_kernel_L999 + cmp INC_X, xzr + ble .Lcsum_kernel_L999 + + cmp INC_X, #1 + bne .Lcsum_kernel_S_BEGIN + +.Lcsum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq .Lcsum_kernel_F1 + +.Lcsum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne .Lcsum_kernel_F8 + + KERNEL_F8_FINALIZE + +.Lcsum_kernel_F1: + + ands I, N, #7 + ble .Lcsum_kernel_L999 + +.Lcsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lcsum_kernel_F10 + +.Lcsum_kernel_L999: + ret + +.Lcsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lcsum_kernel_S1 + +.Lcsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lcsum_kernel_S4 + +.Lcsum_kernel_S1: + + ands I, N, #3 + ble .Lcsum_kernel_L999 + +.Lcsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lcsum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/sum.S b/kernel/arm64/sum.S new file mode 100644 index 000000000..16d0dc4e4 --- /dev/null +++ b/kernel/arm64/sum.S @@ -0,0 +1,186 @@ +/******************************************************************************* +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0] + fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0] + fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0] + PRFM PLDL1KEEP, [X, #1024] +#else // DOUBLE + ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] + add X, X, #64 + + PRFM PLDL1KEEP, [X, #1024] + + fadd v2.2d, v2.2d, v3.2d + fadd v4.2d, v4.2d, v5.2d + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v4.2d +#endif +.endm + +.macro KERNEL_F8_FINALIZE +#if !defined(DOUBLE) + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +#else + faddp SUMF, v0.2d +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 +#else + lsl INC_X, INC_X, #3 +#endif +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 +#if !defined(DOUBLE) + fmov s1, SUMF +#else + fmov d1, SUMF +#endif + + cmp N, xzr + ble .Lsum_kernel_L999 + cmp INC_X, xzr + ble .Lsum_kernel_L999 + + cmp INC_X, #1 + bne .Lsum_kernel_S_BEGIN + +.Lsum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq .Lsum_kernel_F1 + +.Lsum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne .Lsum_kernel_F8 + + KERNEL_F8_FINALIZE + +.Lsum_kernel_F1: + + ands I, N, #7 + ble .Lsum_kernel_L999 + +.Lsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lsum_kernel_F10 + +.Lsum_kernel_L999: + 
ret + +.Lsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lsum_kernel_S1 + +.Lsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lsum_kernel_S4 + +.Lsum_kernel_S1: + + ands I, N, #3 + ble .Lsum_kernel_L999 + +.Lsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lsum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/zsum.S b/kernel/arm64/zsum.S new file mode 100644 index 000000000..67ea3cb4d --- /dev/null +++ b/kernel/arm64/zsum.S @@ -0,0 +1,158 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2d}, [X], #16 + faddp TMPF, v1.2d + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F4 + ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 + + fadd v1.2d, v1.2d, v2.2d + fadd v3.2d, v3.2d, v4.2d + + fadd v0.2d, v0.2d, v1.2d + fadd v0.2d, v0.2d, v3.2d + + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F4_FINALIZE + faddp SUMF, v0.2d +.endm + +.macro INIT_S + lsl INC_X, INC_X, #4 +.endm + +.macro KERNEL_S1 + ld1 {v1.2d}, [X], INC_X + faddp TMPF, v1.2d + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + + cmp N, xzr + ble .Lzsum_kernel_L999 + cmp INC_X, xzr + ble .Lzsum_kernel_L999 + + cmp INC_X, #1 + bne .Lzsum_kernel_S_BEGIN + +.Lzsum_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq .Lzsum_kernel_F1 + +.Lzsum_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne .Lzsum_kernel_F4 + + KERNEL_F4_FINALIZE + +.Lzsum_kernel_F1: + + ands I, N, #3 + ble .Lzsum_kernel_L999 + +.Lzsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lzsum_kernel_F10 + +.Lzsum_kernel_L999: + ret + +.Lzsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lzsum_kernel_S1 + +.Lzsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lzsum_kernel_S4 + +.Lzsum_kernel_S1: + + ands I, N, #3 + ble .Lzsum_kernel_L999 + +.Lzsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lzsum_kernel_S10 + + ret + + EPILOGUE From f8b82bc6dc0c7650fa757c6eadf1906e0bc50950 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:18:03 +0100 Subject: [PATCH 519/935] Add ia64 implementation of ?sum as trivial copy of asum with the fabs calls removed --- kernel/ia64/KERNEL | 4 + kernel/ia64/sum.S | 358 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100644 kernel/ia64/sum.S diff --git a/kernel/ia64/KERNEL b/kernel/ia64/KERNEL index 10a7e61e2..870aac473 100644 --- a/kernel/ia64/KERNEL +++ b/kernel/ia64/KERNEL @@ -60,6 +60,10 @@ CASUMKERNEL = asum.S ZASUMKERNEL = asum.S XASUMKERNEL = asum.S +CSUMKERNEL = sum.S +ZSUMKERNEL = sum.S +XSUMKERNEL = sum.S + CNRM2KERNEL = nrm2.S ZNRM2KERNEL = nrm2.S XNRM2KERNEL = nrm2.S diff --git a/kernel/ia64/sum.S b/kernel/ia64/sum.S new file mode 100644 index 000000000..561d5d771 --- /dev/null +++ b/kernel/ia64/sum.S @@ -0,0 +1,358 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2019, The OpenBLAS project */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#ifndef COMPLEX +#define COMPADD 0 +#define STRIDE INCX +#else +#define COMPADD 1 +#define STRIDE SIZE +#endif + +#define PRE1 r2 + +#define I r17 +#define J r18 +#define INCX16 r21 + +#define PR r30 +#define ARLC r31 + +#define N r32 +#define X r33 +#define INCX r34 + + + PROLOGUE + .prologue + PROFCODE + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + { .mmi + cmp.lt p0, p6 = r0, INCX + cmp.lt p0, p7 = r0, N + shr I = N, (4 - COMPADD) + } + { .mbb + and J = ((1 << (4 - COMPADD)) - 1), N + (p6) br.ret.sptk.many b0 + (p7) br.ret.sptk.many b0 + } + ;; + { .mfi + adds I = -1, I + mov f10 = f0 + mov PR = pr + } + { .mfi + cmp.eq p9, p0 = r0, J + mov f9 = f0 + tbit.z p0, p12 = N, 3 - COMPADD + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + cmp.ne p17, p0 = r0, r0 + mov ar.ec= 3 + } + { .mfi + cmp.ne p18, p0 = r0, r0 + mov f11 = f0 + shl INCX = INCX, BASE_SHIFT + COMPADD + } + ;; + { .mmi +#ifdef XDOUBLE + shladd INCX16 = INCX, (3 - COMPADD), r0 +#else + shladd INCX16 = INCX, (4 - COMPADD), r0 +#endif + cmp.ne p19, p0 = r0, r0 + mov ar.lc = I + } + { .mmb + cmp.gt p8 ,p0 = r0, I +#ifdef COMPLEX + adds INCX = - SIZE, INCX +#else + nop.m 0 +#endif + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X], STRIDE + } + { .mfb + (p19) FADD f8 = f8, f71 + } + ;; + { .mmf + (p16) LDFD f35 = [X], INCX + } + { .mfb + (p19) FADD f9 = f9, f74 + } + ;; + { .mmf + (p16) LDFD f38 = [X], STRIDE + } + { .mfb + (p19) FADD f10 = f10, f77 
+ } + ;; + { .mmf + (p16) LDFD f41 = [X], INCX + } + { .mfb + (p19) FADD f11 = f11, f80 + } + ;; + { .mmf + (p16) LDFD f44 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f34 + } + ;; + { .mmf + (p16) LDFD f47 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f37 + } + ;; + { .mmf + (p16) LDFD f50 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f40 + } + ;; + { .mmf + (p16) LDFD f53 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f43 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f56 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f46 + } + ;; + { .mmf + (p16) LDFD f59 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f49 + } + ;; + { .mmf + (p16) LDFD f62 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f52 + } + ;; + { .mmf + (p16) LDFD f65 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f55 + } + ;; + { .mmf + (p16) LDFD f68 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f58 + } + ;; + { .mmf + (p16) LDFD f71 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f61 + } + ;; + { .mmf + (p16) LDFD f74 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f64 + } + ;; + { .mmf + (p16) LDFD f77 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f67 + br.ctop.sptk.few .L52 + } + ;; + FADD f8 = f8, f71 + FADD f9 = f9, f74 + FADD f10 = f10, f77 + FADD f11 = f11, f80 + .align 32 + ;; +.L55: + (p12) LDFD f32 = [X], STRIDE + (p9) br.cond.dptk .L998 + ;; + (p12) LDFD f33 = [X], INCX + ;; + (p12) LDFD f34 = [X], STRIDE + ;; + (p12) LDFD f35 = [X], INCX + tbit.z p0, p13 = N, (2 - COMPADD) + ;; + (p12) LDFD f36 = [X], STRIDE + tbit.z p0, p14 = N, (1 - COMPADD) + ;; + (p12) LDFD f37 = [X], INCX +#ifndef COMPLEX + tbit.z p0, p15 = N, 0 +#endif + ;; + (p12) LDFD f38 = [X], STRIDE + ;; + (p12) LDFD f39 = [X], INCX + ;; + (p13) LDFD f40 = [X], STRIDE + ;; + (p13) LDFD f41 = [X], INCX + ;; + (p13) LDFD f42 = [X], STRIDE + (p12) FADD f8 = f8, f32 + ;; + (p13) LDFD f43 = [X], INCX + (p12) FADD f9 = f9, f33 + ;; + (p14) LDFD f44 = [X], STRIDE + (p12) FADD f10 = f10, f34 + ;; + (p14) LDFD f45 = [X], INCX + (p12) FADD f11 = f11, f35 + ;; +#ifndef COMPLEX + (p15) LDFD f46 = [X] +#endif + (p12) FADD f8 = f8, f36 + ;; + (p12) FADD f9 = f9, f37 + (p12) FADD f10 = f10, f38 + (p12) FADD f11 = f11, f39 + ;; + (p13) FADD f8 = f8, f40 + (p13) FADD f9 = f9, f41 +#ifndef COMPLEX +#endif + (p13) FADD f10 = f10, f42 + ;; + (p13) FADD f11 = f11, f43 + (p14) FADD f8 = f8, f44 + (p14) FADD f9 = f9, f45 +#ifndef COMPLEX + (p15) FADD f10 = f10, f46 +#endif + ;; + .align 32 + +.L998: + { .mfi + FADD f8 = f8, f9 + mov ar.lc = ARLC + } + { .mmf + FADD f10 = f10, f11 + } + ;; + { .mii + mov pr = PR, -65474 + } + ;; + { .mfb + FADD f8 = f8, f10 + br.ret.sptk.many b0 + } + EPILOGUE From cdbe0f0235b0d23b19daeb40fab98ec83260197c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:20:14 +0100 Subject: [PATCH 520/935] Add MIPS implementation of ?sum as trivial copy of ?asum with the fabs calls removed --- kernel/mips/KERNEL.P5600 | 5 ++++ kernel/mips/sum.c | 47 ++++++++++++++++++++++++++++++++++++ kernel/mips/zsum.c | 52 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 kernel/mips/sum.c create mode 100644 kernel/mips/zsum.c diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 1ab193069..9a6e06d67 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c ISMINKERNEL = ../mips/imin.c IDMINKERNEL = ../mips/imin.c +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = 
../mips/sum.c
+CSUMKERNEL = ../mips/zsum.c
+ZSUMKERNEL = ../mips/zsum.c
+
 ifdef HAVE_MSA
 SASUMKERNEL = ../mips/sasum_msa.c
 DASUMKERNEL = ../mips/dasum_msa.c
diff --git a/kernel/mips/sum.c b/kernel/mips/sum.c
new file mode 100644
index 000000000..8ce3812a1
--- /dev/null
+++ b/kernel/mips/sum.c
@@ -0,0 +1,47 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    FLOAT sumf = 0.0;
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    n *= inc_x;
+    while(i < n)
+    {
+        sumf += x[i];
+        i += inc_x;
+    }
+    return(sumf);
+}
+
+
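For reference, the pattern behind this whole series of ?sum commits fits in a
few lines of portable C. The sketch below is illustrative only (ref_dsum and
ref_zsum are hypothetical names, not part of the patch): the sole functional
difference from the ?asum kernels is that the absolute value is dropped, and
the complex variants, such as the zsum.c that follows, add the real and
imaginary halves of each element.

    /* Real case: identical to ?asum except that the fabs is removed. */
    double ref_dsum(int n, const double *x, int inc_x)
    {
        double s = 0.0;
        for (int i = 0; i < n; i++)
            s += x[i * inc_x];     /* ?asum would use fabs(x[i * inc_x]) */
        return s;
    }

    /* Complex case: element i occupies two adjacent doubles, so both halves
       are summed, matching the CSUM1(x,i) = x[i] + x[i+1] macro used below. */
    double ref_zsum(int n, const double *x, int inc_x)
    {
        double s = 0.0;
        for (int i = 0; i < n; i++)
            s += x[2 * i * inc_x] + x[2 * i * inc_x + 1];
        return s;
    }
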
diff --git a/kernel/mips/zsum.c b/kernel/mips/zsum.c
new file mode 100644
index 000000000..01f8ced7c
--- /dev/null
+++ b/kernel/mips/zsum.c
@@ -0,0 +1,52 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define CSUM1(x,i) x[i]+x[i+1]
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    FLOAT sumf = 0.0;
+    BLASLONG inc_x2;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    inc_x2 = 2 * inc_x;
+
+    n *= inc_x2;
+    while(i < n)
+    {
+        sumf += CSUM1(x,i);
+        i += inc_x2;
+    }
+    return(sumf);
+}
+
+
From 688fa9201c74a8cc1eafd85ebf36cd74f4bf89f4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 30 Mar 2019 22:22:15 +0100
Subject: [PATCH 521/935] Add MIPS64 implementation of ?sum as trivial copy of
 ?asum with the fabs replaced by mov to preserve code structure

---
 kernel/mips64/sum.S  | 332 +++++++++++++++++++++++++++++++++++++++++++
 kernel/mips64/zsum.S | 204 ++++++++++++++++++++++++++
 2 files changed, 536 insertions(+)
 create mode 100644 kernel/mips64/sum.S
 create mode 100644 kernel/mips64/zsum.S

diff --git a/kernel/mips64/sum.S b/kernel/mips64/sum.S
new file mode 100644
index 000000000..261630d49
--- /dev/null
+++ b/kernel/mips64/sum.S
@@ -0,0 +1,332 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + li TEMP, SIZE + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + + LD a5, 4 * SIZE(X) + MOV t1, a1 + LD a6, 5 * SIZE(X) + MOV t2, a2 + LD a7, 6 * SIZE(X) + MOV t3, a3 + + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L13 + LD a8, 7 * SIZE(X) + .align 3 + +.L12: + ADD s1, s1, t1 + LD a1, 8 * SIZE(X) + + MOV t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 9 * SIZE(X) + + MOV t2, a6 + NOP + + ADD s1, s1, t3 + LD a3, 10 * SIZE(X) + + MOV t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 11 * SIZE(X) + + MOV t4, a8 + daddiu X, X, 8 * SIZE + + ADD s1, s1, t1 + LD a5, 4 * SIZE(X) + + MOV t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 5 * SIZE(X) + + MOV t2, a2 + NOP + + ADD s1, s1, t3 + LD a7, 6 * SIZE(X) + + MOV t3, a3 + NOP + + ADD s2, s2, t4 + LD a8, 7 * SIZE(X) + + bgtz I, .L12 + MOV t4, a4 + .align 3 + +.L13: + ADD s1, s1, t1 + daddiu X, X, 8 * SIZE + + MOV t1, a5 + NOP + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + MOV t1, a1 + + ADD s1, s1, t1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j .L999 + NOP + .align 3 + +.L20: + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + MOV t1, a1 + LD a7, 0 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + MOV t3, a3 + LD a8, 0 * SIZE(X) + + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + MOV t1, a5 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a2, 0 * SIZE(X) + + MOV t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + MOV t3, a7 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a4, 0 * SIZE(X) + + MOV t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + MOV t1, a1 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a6, 0 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + MOV t3, a3 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a8, 0 * SIZE(X) + + MOV t4, a4 + daddiu I, I, -1 + + bgtz I, .L23 + daddu X, X, INCX + .align 3 + +.L24: + ADD s1, s1, t1 + MOV t1, a5 + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + MOV t1, a1 + daddu X, X, INCX + + bgtz I, .L26 + ADD s1, s1, t1 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/mips64/zsum.S 
b/kernel/mips64/zsum.S new file mode 100644 index 000000000..129b97900 --- /dev/null +++ b/kernel/mips64/zsum.S @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsra I, N, 2 + + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + MOV t1, a1 + MOV t2, a2 + + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + + MOV t3, a3 + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + MOV t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 1 * SIZE(X) + + MOV t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + MOV t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 1 * SIZE(X) + + MOV t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + MOV t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 1 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + MOV t3, a3 + LD a8, 1 * SIZE(X) + + ADD s2, s2, t4 + daddu X, X, INCX + + bgtz I, .L23 + MOV t4, a4 + .align 3 + +.L24: + ADD s1, s1, t1 + MOV t1, a5 + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + MOV t1, a1 + daddiu I, I, -1 + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t1 + bgtz I, .L26 + ADD s2, s2, t2 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE From 706dfe263b7e3fb20dca7c7e9fdab79c9e86cb13 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:23:42 +0100 Subject: [PATCH 522/935] Add POWER implementation of ?sum as trivial copy of ?asum with the fabs replaced by fmr to preserve code structure --- kernel/power/sum.S | 446 +++++++++++++++++++++++++++++++++++++++++++ kernel/power/zsum.S | 452 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 898 insertions(+) create mode 100644 kernel/power/sum.S create mode 100644 kernel/power/zsum.S diff --git a/kernel/power/sum.S b/kernel/power/sum.S new file mode 100644 index 000000000..eda2c5f2c --- /dev/null +++ b/kernel/power/sum.S @@ -0,0 +1,446 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + + FADD f0, f0, f8 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + FADD f0, f0, f8 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zsum.S b/kernel/power/zsum.S new file mode 100644 index 000000000..8396012e8 --- /dev/null +++ b/kernel/power/zsum.S @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCXM1 r9 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + FADD f0, f0, f8 + FADD f1, f1, f9 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + FADD f0, f0, f8 + FADD f1, f1, f9 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE From 70f2a4e0d70609f13c9f35112b90516830c30689 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:25:06 +0100 Subject: [PATCH 523/935] Add SPARC implementation of ?sum as trivial copy of ?asum with the fabs replaced by fmov to preserve code structure --- kernel/sparc/sum.S | 325 +++++++++++++++++++++++++++++++++++++++++++ kernel/sparc/zsum.S | 327 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 652 insertions(+) create mode 100644 kernel/sparc/sum.S create mode 100644 kernel/sparc/zsum.S diff --git a/kernel/sparc/sum.S b/kernel/sparc/sum.S new file mode 100644 index 000000000..f26abb85f --- /dev/null +++ b/kernel/sparc/sum.S @@ -0,0 +1,325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, BASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + cmp INCX, SIZE + bne .LL50 + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 128 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FMOV a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FMOV a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FMOV a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FMOV a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FMOV a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD 
c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FMOV a1, t1 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FADD c2, t2, c2 + cmp I, 0 + FMOV a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FADD c2, t4, c2 + FMOV a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FMOV a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + + FADD c2, t2, c2 + FMOV a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + + FADD c2, t4, c2 + FMOV a8, t4 + LDF [X + 0 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zsum.S b/kernel/sparc/zsum.S new file mode 100644 index 000000000..bc167dc72 --- /dev/null +++ b/kernel/sparc/zsum.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, ZBASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + nop + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 32 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FMOV a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FMOV a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FMOV a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FMOV a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FMOV a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FADD c2, t2, c2 + FMOV a1, t1 + FMOV a2, t2 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, 
INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + cmp I, 0 + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a3, t3 + LDF [X + 0 * SIZE], a3 + + FADD c2, t4, c2 + FMOV a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FMOV a5, t1 + LDF [X + 0 * SIZE], a5 + + FADD c2, t2, c2 + FMOV a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 0 * SIZE], a7 + + FADD c2, t4, c2 + FMOV a8, t4 + LDF [X + 1 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + FMOV a1, t1 + FMOV a2, t2 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE From e3bc83f2a8b3304fd1d8107a2f73a672a4ec5ffe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:26:10 +0100 Subject: [PATCH 524/935] Add x86 implementation of ?sum as trivial copy of ?asum with the fabs calls removed --- kernel/x86/sum.S | 207 +++++++++++++++++++++++++++++++++++++++++++++ kernel/x86/zsum.S | 208 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 415 insertions(+) create mode 100644 kernel/x86/sum.S create mode 100644 kernel/x86/zsum.S diff --git a/kernel/x86/sum.S b/kernel/x86/sum.S new file mode 100644 index 000000000..b24f34c8b --- /dev/null +++ b/kernel/x86/sum.S @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $BASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + faddp %st,%st(1) + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addl INCX, X + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zsum.S b/kernel/x86/zsum.S new file mode 100644 index 000000000..cd2ce61db --- /dev/null +++ b/kernel/x86/zsum.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpl $SIZE * 2, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + faddp %st,%st(3) + faddp %st,%st(1) + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + FLD 1 * 
SIZE(X) + addl INCX, X + faddp %st,%st(3) + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE From 9d717cb5ee817f87a1306d64da75a09375abd407 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:27:04 +0100 Subject: [PATCH 525/935] Add x86_64 implementation of ?sum as trivial copy of ?asum with the fabs calls removed --- kernel/x86_64/sum.S | 179 ++++++++++++++++++++++++++++++++++++++++++ kernel/x86_64/zsum.S | 180 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 359 insertions(+) create mode 100644 kernel/x86_64/sum.S create mode 100644 kernel/x86_64/zsum.S diff --git a/kernel/x86_64/sum.S b/kernel/x86_64/sum.S new file mode 100644 index 000000000..d075eaa04 --- /dev/null +++ b/kernel/x86_64/sum.S @@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $BASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $7, M + jle .L998 + ALIGN_4 + +.L21: + FLD (X) + faddp %st,%st(1) + addq $1 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $7, M + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addq INCX, X + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/zsum.S b/kernel/x86_64/zsum.S new file mode 100644 index 000000000..45e0ddff5 --- /dev/null +++ b/kernel/x86_64/zsum.S @@ -0,0 +1,180 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
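The x87 kernels above keep four independent partial sums on the FPU stack (the
four fldz pushes) and rotate the faddp targets, so consecutive adds do not
serialize on a single register. In C the same idea looks roughly like this (a
sketch of the technique, not the generated code):

    /* Four-accumulator summation; the final reduction plays the role of
       the chain of faddp instructions at .L998 in the kernels above. */
    double sum4(int n, const double *x) {
        double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
        int i = 0;
        for (; i + 4 <= n; i += 4) {   /* unrolled main loop */
            s0 += x[i];
            s1 += x[i + 1];
            s2 += x[i + 2];
            s3 += x[i + 3];
        }
        for (; i < n; i++) s0 += x[i]; /* remainder, one element at a time */
        return (s0 + s1) + (s2 + s3);  /* reduce the partial sums */
    }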
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE * 2, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $3, M + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + faddp %st,%st(3) + faddp %st,%st(1) + addq $2 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + faddp %st,%st(3) + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE From 246ca29679c5e74d2f306e39eefd1939aa6c37bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:49:05 +0100 Subject: [PATCH 526/935] Add ZARCH implementation of ?sum as trivial copies of the respective ?asum kernels with the ABS and vflpsb calls removed --- kernel/zarch/KERNEL.Z13 | 5 + kernel/zarch/KERNEL.Z14 | 5 + kernel/zarch/KERNEL.ZARCH_GENERIC | 5 + kernel/zarch/csum.c | 137 +++++++++++++++++++++++++++ kernel/zarch/dsum.c | 148 +++++++++++++++++++++++++++++ kernel/zarch/ssum.c | 151 ++++++++++++++++++++++++++++++ kernel/zarch/zsum.c | 136 +++++++++++++++++++++++++++ 7 files changed, 587 insertions(+) create mode 100644 kernel/zarch/csum.c create mode 100644 kernel/zarch/dsum.c create mode 100644 kernel/zarch/ssum.c create mode 100644 kernel/zarch/zsum.c diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index 22c7e9703..b1ffd3c54 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c +SSUMKERNEL = ../arm/asum.c +DSUMKERNEL = dasum.c +CSUMKERNEL = ../arm/zasum.c +ZSUMKERNEL = zasum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index 80f78f48f..971896c2d 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c +SSUMKERNEL = 
ssum.c +DSUMKERNEL = dsum.c +CSUMKERNEL = csum.c +ZSUMKERNEL = zsum.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c diff --git a/kernel/zarch/KERNEL.ZARCH_GENERIC b/kernel/zarch/KERNEL.ZARCH_GENERIC index 848ee9b54..3bbeb9155 100644 --- a/kernel/zarch/KERNEL.ZARCH_GENERIC +++ b/kernel/zarch/KERNEL.ZARCH_GENERIC @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c new file mode 100644 index 000000000..c0b8c6371 --- /dev/null +++ b/kernel/zarch/csum.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
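The C wrappers in the new files below all share one dispatch shape: if the
stride is one, round n down to the kernel's block size, hand that part to the
vectorized routine, and finish the remainder (and the strided case) in plain C.
A condensed sketch of that shape, with a hypothetical vector_kernel standing in
for e.g. csum_kernel_32 (real-valued for brevity; the complex wrappers do the
same over interleaved pairs):

    static float sum_dispatch(long n, float *x, long inc_x,
                              float (*vector_kernel)(long, float *)) {
        float sumf = 0.0f;
        long i = 0;
        if (n <= 0 || inc_x <= 0) return sumf;
        if (inc_x == 1) {
            long n1 = n & -32;  /* largest multiple of 32 not above n */
            if (n1 > 0) {
                sumf = vector_kernel(n1, x);
                i = n1;
            }
            for (; i < n; i++) sumf += x[i];          /* scalar tail */
        } else {
            for (; i < n; i++) sumf += x[i * inc_x];  /* strided fallback */
        }
        return sumf;
    }

The n & -32 idiom works because -32 in two's complement has the five low bits
clear, so the AND rounds a non-negative n down to a multiple of 32.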
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
+  FLOAT sum;
+
+  __asm__("vzero %%v24\n\t"
+          "vzero %%v25\n\t"
+          "vzero %%v26\n\t"
+          "vzero %%v27\n\t"
+          "vzero %%v28\n\t"
+          "vzero %%v29\n\t"
+          "vzero %%v30\n\t"
+          "vzero %%v31\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16, 0(%%r1,%[x])\n\t"
+          "vl %%v17, 16(%%r1,%[x])\n\t"
+          "vl %%v18, 32(%%r1,%[x])\n\t"
+          "vl %%v19, 48(%%r1,%[x])\n\t"
+          "vl %%v20, 64(%%r1,%[x])\n\t"
+          "vl %%v21, 80(%%r1,%[x])\n\t"
+          "vl %%v22, 96(%%r1,%[x])\n\t"
+          "vl %%v23, 112(%%r1,%[x])\n\t"
+          "vfasb %%v24,%%v24,%%v16\n\t"
+          "vfasb %%v25,%%v25,%%v17\n\t"
+          "vfasb %%v26,%%v26,%%v18\n\t"
+          "vfasb %%v27,%%v27,%%v19\n\t"
+          "vfasb %%v28,%%v28,%%v20\n\t"
+          "vfasb %%v29,%%v29,%%v21\n\t"
+          "vfasb %%v30,%%v30,%%v22\n\t"
+          "vfasb %%v31,%%v31,%%v23\n\t"
+          "vl %%v16, 128(%%r1,%[x])\n\t"
+          "vl %%v17, 144(%%r1,%[x])\n\t"
+          "vl %%v18, 160(%%r1,%[x])\n\t"
+          "vl %%v19, 176(%%r1,%[x])\n\t"
+          "vl %%v20, 192(%%r1,%[x])\n\t"
+          "vl %%v21, 208(%%r1,%[x])\n\t"
+          "vl %%v22, 224(%%r1,%[x])\n\t"
+          "vl %%v23, 240(%%r1,%[x])\n\t"
+          "vfasb %%v24,%%v24,%%v16\n\t"
+          "vfasb %%v25,%%v25,%%v17\n\t"
+          "vfasb %%v26,%%v26,%%v18\n\t"
+          "vfasb %%v27,%%v27,%%v19\n\t"
+          "vfasb %%v28,%%v28,%%v20\n\t"
+          "vfasb %%v29,%%v29,%%v21\n\t"
+          "vfasb %%v30,%%v30,%%v22\n\t"
+          "vfasb %%v31,%%v31,%%v23\n\t"
+          "agfi %%r1,256\n\t"
+          "brctg %[n],0b\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vfasb %%v24,%%v24,%%v26\n\t"
+          "vfasb %%v24,%%v24,%%v27\n\t"
+          "vfasb %%v24,%%v24,%%v28\n\t"
+          "vfasb %%v24,%%v24,%%v29\n\t"
+          "vfasb %%v24,%%v24,%%v30\n\t"
+          "vfasb %%v24,%%v24,%%v31\n\t"
+          "veslg %%v25,%%v24,32\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vrepf %%v25,%%v24,2\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vstef %%v24,%[sum],0"
+          : [sum] "=Q"(sum),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return sum;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG ip = 0;
+  FLOAT sumf = 0.0;
+  BLASLONG n1;
+  BLASLONG inc_x2;
+
+  if (n <= 0 || inc_x <= 0)
+    return (sumf);
+
+  if (inc_x == 1) {
+
+    n1 = n & -32;
+    if (n1 > 0) {
+
+      sumf = csum_kernel_32(n1, x);
+      i = n1;
+      ip = 2 * n1;
+    }
+
+    while (i < n) {
+      sumf += x[ip] + x[ip + 1];
+      i++;
+      ip += 2;
+    }
+
+  } else {
+    inc_x2 = 2 * inc_x;
+
+    while (i < n) {
+      sumf += x[ip] + x[ip + 1];
+      ip += inc_x2;
+      i++;
+    }
+
+  }
+  return (sumf);
+}
diff --git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c
new file mode 100644
index 000000000..178bc3462
--- /dev/null
+++ b/kernel/zarch/dsum.c
@@ -0,0 +1,148 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
+  FLOAT sum;
+
+  __asm__("vzero %%v24\n\t"
+          "vzero %%v25\n\t"
+          "vzero %%v26\n\t"
+          "vzero %%v27\n\t"
+          "vzero %%v28\n\t"
+          "vzero %%v29\n\t"
+          "vzero %%v30\n\t"
+          "vzero %%v31\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16, 0(%%r1,%[x])\n\t"
+          "vl %%v17, 16(%%r1,%[x])\n\t"
+          "vl %%v18, 32(%%r1,%[x])\n\t"
+          "vl %%v19, 48(%%r1,%[x])\n\t"
+          "vl %%v20, 64(%%r1,%[x])\n\t"
+          "vl %%v21, 80(%%r1,%[x])\n\t"
+          "vl %%v22, 96(%%r1,%[x])\n\t"
+          "vl %%v23, 112(%%r1,%[x])\n\t"
+          "vfadb %%v24,%%v24,%%v16\n\t"
+          "vfadb %%v25,%%v25,%%v17\n\t"
+          "vfadb %%v26,%%v26,%%v18\n\t"
+          "vfadb %%v27,%%v27,%%v19\n\t"
+          "vfadb %%v28,%%v28,%%v20\n\t"
+          "vfadb %%v29,%%v29,%%v21\n\t"
+          "vfadb %%v30,%%v30,%%v22\n\t"
+          "vfadb %%v31,%%v31,%%v23\n\t"
+          "vl %%v16, 128(%%r1,%[x])\n\t"
+          "vl %%v17, 144(%%r1,%[x])\n\t"
+          "vl %%v18, 160(%%r1,%[x])\n\t"
+          "vl %%v19, 176(%%r1,%[x])\n\t"
+          "vl %%v20, 192(%%r1,%[x])\n\t"
+          "vl %%v21, 208(%%r1,%[x])\n\t"
+          "vl %%v22, 224(%%r1,%[x])\n\t"
+          "vl %%v23, 240(%%r1,%[x])\n\t"
+          "vfadb %%v24,%%v24,%%v16\n\t"
+          "vfadb %%v25,%%v25,%%v17\n\t"
+          "vfadb %%v26,%%v26,%%v18\n\t"
+          "vfadb %%v27,%%v27,%%v19\n\t"
+          "vfadb %%v28,%%v28,%%v20\n\t"
+          "vfadb %%v29,%%v29,%%v21\n\t"
+          "vfadb %%v30,%%v30,%%v22\n\t"
+          "vfadb %%v31,%%v31,%%v23\n\t"
+          "agfi %%r1,256\n\t"
+          "brctg %[n],0b\n\t"
+          "vfadb %%v24,%%v24,%%v25\n\t"
+          "vfadb %%v24,%%v24,%%v26\n\t"
+          "vfadb %%v24,%%v24,%%v27\n\t"
+          "vfadb %%v24,%%v24,%%v28\n\t"
+          "vfadb %%v24,%%v24,%%v29\n\t"
+          "vfadb %%v24,%%v24,%%v30\n\t"
+          "vfadb %%v24,%%v24,%%v31\n\t"
+          "vrepg %%v25,%%v24,1\n\t"
+          "vfadb %%v24,%%v24,%%v25\n\t"
+          "vsteg %%v24,%[sum],0"
+          : [sum] "=Q"(sum),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return sum;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG j = 0;
+  FLOAT sumf = 0.0;
+  BLASLONG n1;
+
+  if (n <= 0 || inc_x <= 0)
+    return sumf;
+
+  if (inc_x == 1) {
+
+    n1 = n & -32;
+
+    if (n1 > 0) {
+
+      sumf = dsum_kernel_32(n1, x);
+      i = n1;
+    }
+
+    while (i < n) {
+      sumf += x[i];
+      i++;
+    }
+
+  } else {
+    BLASLONG n1 = n & -4;
+    register FLOAT sum1, sum2;
+    sum1 = 0.0;
+    sum2 = 0.0;
+    while (j < n1) {
+
+      sum1 += x[i];
+      sum2 += x[i + inc_x];
+      sum1 += x[i + 2 * inc_x];
+      sum2 += x[i + 3 *
inc_x]; + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += x[i]; + i += inc_x; + j++; + } + + } + return sumf; +} diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c new file mode 100644 index 000000000..a433ab592 --- /dev/null +++ b/kernel/zarch/ssum.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
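One detail shared by all four of these kernels is the input operand
"m"(*(const struct { FLOAT x[n]; } *) x): casting the pointer to a
variable-length struct gives the asm statement a memory operand that covers
exactly the n elements it reads, which lets GCC track the dependency without a
blanket "memory" clobber. A minimal illustration of the idiom (GNU C, sketch
only):

    /* The empty asm consumes the whole runtime-sized array as a memory
       input, so earlier stores to x cannot be reordered past it. */
    static inline void asm_reads_array(long n, const double *x) {
        __asm__ volatile(""  /* no instructions needed for the example */
                         :
                         : "m"(*(const struct { double v[n]; } *) x));
    }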
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
+  FLOAT sum;
+
+  __asm__("vzero %%v24\n\t"
+          "vzero %%v25\n\t"
+          "vzero %%v26\n\t"
+          "vzero %%v27\n\t"
+          "vzero %%v28\n\t"
+          "vzero %%v29\n\t"
+          "vzero %%v30\n\t"
+          "vzero %%v31\n\t"
+          "srlg %[n],%[n],6\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16, 0(%%r1,%[x])\n\t"
+          "vl %%v17, 16(%%r1,%[x])\n\t"
+          "vl %%v18, 32(%%r1,%[x])\n\t"
+          "vl %%v19, 48(%%r1,%[x])\n\t"
+          "vl %%v20, 64(%%r1,%[x])\n\t"
+          "vl %%v21, 80(%%r1,%[x])\n\t"
+          "vl %%v22, 96(%%r1,%[x])\n\t"
+          "vl %%v23, 112(%%r1,%[x])\n\t"
+          "vfasb %%v24,%%v24,%%v16\n\t"
+          "vfasb %%v25,%%v25,%%v17\n\t"
+          "vfasb %%v26,%%v26,%%v18\n\t"
+          "vfasb %%v27,%%v27,%%v19\n\t"
+          "vfasb %%v28,%%v28,%%v20\n\t"
+          "vfasb %%v29,%%v29,%%v21\n\t"
+          "vfasb %%v30,%%v30,%%v22\n\t"
+          "vfasb %%v31,%%v31,%%v23\n\t"
+          "vl %%v16, 128(%%r1,%[x])\n\t"
+          "vl %%v17, 144(%%r1,%[x])\n\t"
+          "vl %%v18, 160(%%r1,%[x])\n\t"
+          "vl %%v19, 176(%%r1,%[x])\n\t"
+          "vl %%v20, 192(%%r1,%[x])\n\t"
+          "vl %%v21, 208(%%r1,%[x])\n\t"
+          "vl %%v22, 224(%%r1,%[x])\n\t"
+          "vl %%v23, 240(%%r1,%[x])\n\t"
+          "vfasb %%v24,%%v24,%%v16\n\t"
+          "vfasb %%v25,%%v25,%%v17\n\t"
+          "vfasb %%v26,%%v26,%%v18\n\t"
+          "vfasb %%v27,%%v27,%%v19\n\t"
+          "vfasb %%v28,%%v28,%%v20\n\t"
+          "vfasb %%v29,%%v29,%%v21\n\t"
+          "vfasb %%v30,%%v30,%%v22\n\t"
+          "vfasb %%v31,%%v31,%%v23\n\t"
+          "agfi %%r1,256\n\t"
+          "brctg %[n],0b\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vfasb %%v24,%%v24,%%v26\n\t"
+          "vfasb %%v24,%%v24,%%v27\n\t"
+          "vfasb %%v24,%%v24,%%v28\n\t"
+          "vfasb %%v24,%%v24,%%v29\n\t"
+          "vfasb %%v24,%%v24,%%v30\n\t"
+          "vfasb %%v24,%%v24,%%v31\n\t"
+          "veslg %%v25,%%v24,32\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vrepf %%v25,%%v24,2\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vstef %%v24,%[sum],0"
+          : [sum] "=Q"(sum),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return sum;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG j = 0;
+  FLOAT sumf = 0.0;
+  BLASLONG n1;
+
+  if (n <= 0 || inc_x <= 0)
+    return sumf;
+
+  if (inc_x == 1) {
+
+    n1 = n & -64;
+
+    if (n1 > 0) {
+
+      sumf = ssum_kernel_64(n1, x);
+      i = n1;
+    }
+
+    while (i < n) {
+      sumf += x[i];
+      i++;
+    }
+
+  } else {
+    BLASLONG n1 = n & -4;
+    register FLOAT sum1, sum2;
+    sum1 = 0.0;
+    sum2 = 0.0;
+    while (j < n1) {
+
+      sum1 += x[i];
+      sum2 += x[i + inc_x];
+      sum1 += x[i + 2 * inc_x];
+      sum2 += x[i + 3 * inc_x];
+
+      i += inc_x * 4;
+      j += 4;
+
+    }
+    sumf = sum1 + sum2;
+    while (j < n) {
+
+      sumf += x[i];
+      i += inc_x;
+      j++;
+    }
+
+  }
+  return sumf;
+}
diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c
new file mode 100644
index 000000000..7cfc1f17f
--- /dev/null
+++ b/kernel/zarch/zsum.c
@@ -0,0 +1,136 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
+  FLOAT sum;
+
+  __asm__("vzero %%v24\n\t"
+          "vzero %%v25\n\t"
+          "vzero %%v26\n\t"
+          "vzero %%v27\n\t"
+          "vzero %%v28\n\t"
+          "vzero %%v29\n\t"
+          "vzero %%v30\n\t"
+          "vzero %%v31\n\t"
+          "srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16, 0(%%r1,%[x])\n\t"
+          "vl %%v17, 16(%%r1,%[x])\n\t"
+          "vl %%v18, 32(%%r1,%[x])\n\t"
+          "vl %%v19, 48(%%r1,%[x])\n\t"
+          "vl %%v20, 64(%%r1,%[x])\n\t"
+          "vl %%v21, 80(%%r1,%[x])\n\t"
+          "vl %%v22, 96(%%r1,%[x])\n\t"
+          "vl %%v23, 112(%%r1,%[x])\n\t"
+          "vfadb %%v24,%%v24,%%v16\n\t"
+          "vfadb %%v25,%%v25,%%v17\n\t"
+          "vfadb %%v26,%%v26,%%v18\n\t"
+          "vfadb %%v27,%%v27,%%v19\n\t"
+          "vfadb %%v28,%%v28,%%v20\n\t"
+          "vfadb %%v29,%%v29,%%v21\n\t"
+          "vfadb %%v30,%%v30,%%v22\n\t"
+          "vfadb %%v31,%%v31,%%v23\n\t"
+          "vl %%v16, 128(%%r1,%[x])\n\t"
+          "vl %%v17, 144(%%r1,%[x])\n\t"
+          "vl %%v18, 160(%%r1,%[x])\n\t"
+          "vl %%v19, 176(%%r1,%[x])\n\t"
+          "vl %%v20, 192(%%r1,%[x])\n\t"
+          "vl %%v21, 208(%%r1,%[x])\n\t"
+          "vl %%v22, 224(%%r1,%[x])\n\t"
+          "vl %%v23, 240(%%r1,%[x])\n\t"
+          "vfadb %%v24,%%v24,%%v16\n\t"
+          "vfadb %%v25,%%v25,%%v17\n\t"
+          "vfadb %%v26,%%v26,%%v18\n\t"
+          "vfadb %%v27,%%v27,%%v19\n\t"
+          "vfadb %%v28,%%v28,%%v20\n\t"
+          "vfadb %%v29,%%v29,%%v21\n\t"
+          "vfadb %%v30,%%v30,%%v22\n\t"
+          "vfadb %%v31,%%v31,%%v23\n\t"
+          "agfi %%r1,256\n\t"
+          "brctg %[n],0b\n\t"
+          "vfadb %%v24,%%v24,%%v25\n\t"
+          "vfadb %%v24,%%v24,%%v26\n\t"
+          "vfadb %%v24,%%v24,%%v27\n\t"
+          "vfadb %%v24,%%v24,%%v28\n\t"
+          "vfadb %%v24,%%v24,%%v29\n\t"
+          "vfadb %%v24,%%v24,%%v30\n\t"
+          "vfadb %%v24,%%v24,%%v31\n\t"
+          "vrepg %%v25,%%v24,1\n\t"
+          "vfadb %%v24,%%v24,%%v25\n\t"
+          "vsteg %%v24,%[sum],0"
+          : [sum] "=Q"(sum),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return sum;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG ip = 0;
+  FLOAT sumf = 0.0;
+  BLASLONG n1;
+  BLASLONG inc_x2;
+
+  if (n <= 0 || inc_x <= 0)
+    return (sumf);
+
+  if (inc_x == 1) {
+
+    n1 = n & -16;
+    if (n1 > 0) {
+
+      sumf = zsum_kernel_16(n1, x);
+      i = n1;
+      ip = 2 * n1;
+    }
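+
+    /* Scalar tail: the vector kernel above covers only multiples of 16
+       complex elements; the remainder is accumulated one complex element
+       (real plus imaginary part) at a time. */
+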
while (i < n) { + sumf += x[ip] + x[ip + 1]; + i++; + ip += 2; + } + + } else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + ip += inc_x2; + i++; + } + + } + return (sumf); +} From 1679de5e5968bdeffd63793bed55048088216c18 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 10:50:43 +0200 Subject: [PATCH 527/935] Detect 32bit environment on 64bit ARM hardware for #2056, using same approach as #2058 --- cmake/system_check.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index f30a946b4..94d3ba643 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -49,7 +49,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") set(ARM 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") - set(ARM64 1) + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + set(ARM64 1) + else() + set(ARM 1) + endif() endif() if (X86_64) From d17da6c6a44bcf94a1e677642288261f7a1848d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 11:57:01 +0200 Subject: [PATCH 528/935] Add cmake defaults for ?sum kernels --- cmake/kernel.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index fad84de51..0ed09e776 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -107,6 +107,12 @@ macro(SetDefaultL1) set(DAXPBYKERNEL ../arm/axpby.c) set(CAXPBYKERNEL ../arm/zaxpby.c) set(ZAXPBYKERNEL ../arm/zaxpby.c) + set(SSUMKERNEL sum.S) + set(DSUMKERNEL sum.S) + set(CSUMKERNEL zsum.S) + set(ZSUMKERNEL zsum.S) + set(QSUMKERNEL sum.S) + set(XSUMKERNEL zsum.S) endmacro () macro(SetDefaultL2) @@ -162,4 +168,4 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) -endmacro () \ No newline at end of file +endmacro () From 100d94f94edca3274f75658198dada784dd18daa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 13:55:05 +0200 Subject: [PATCH 529/935] Add ?sum --- kernel/x86/KERNEL.generic | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/x86/KERNEL.generic b/kernel/x86/KERNEL.generic index 672edb069..0aac0ce99 100644 --- a/kernel/x86/KERNEL.generic +++ b/kernel/x86/KERNEL.generic @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c From c04a7290812f79972b7dbe92be4ccd6e879e88d8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 13:55:49 +0200 Subject: [PATCH 530/935] Add ?sum definitions for generic kernel --- kernel/x86_64/KERNEL.generic | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index a23e59f3f..7cb0cb836 100644 --- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c From 21d146a8de232a2774d706c5725586dca3d39c02 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 
22:12:23 +0200 Subject: [PATCH 531/935] Add declarations for ?sum --- common_q.h | 2 ++ common_x.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/common_q.h b/common_q.h index 30ad3727a..b4ace3a62 100644 --- a/common_q.h +++ b/common_q.h @@ -19,6 +19,7 @@ #define QDOTC_K qdot_k #define QNRM2_K qnrm2_k #define QSCAL_K qscal_k +#define QSUM_K qsum_k #define QSWAP_K qswap_k #define QROT_K qrot_k @@ -161,6 +162,7 @@ #define QDOTC_K gotoblas -> qdot_k #define QNRM2_K gotoblas -> qnrm2_k #define QSCAL_K gotoblas -> qscal_k +#define QSUM_K gotoblas -> qsum_k #define QSWAP_K gotoblas -> qswap_k #define QROT_K gotoblas -> qrot_k diff --git a/common_x.h b/common_x.h index 03b98db4f..2ed525faa 100644 --- a/common_x.h +++ b/common_x.h @@ -19,6 +19,7 @@ #define XDOTC_K xdotc_k #define XNRM2_K xnrm2_k #define XSCAL_K xscal_k +#define XSUM_K xsum_k #define XSWAP_K xswap_k #define XROT_K xqrot_k @@ -227,6 +228,7 @@ #define XDOTC_K gotoblas -> xdotc_k #define XNRM2_K gotoblas -> xnrm2_k #define XSCAL_K gotoblas -> xscal_k +#define XSUM_K gotoblas -> xsum_k #define XSWAP_K gotoblas -> xswap_k #define XROT_K gotoblas -> xqrot_k From 9229d6859b5f4b185315048ccc58644c9112bdd5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Apr 2019 09:38:18 +0200 Subject: [PATCH 532/935] Add -lm and disable EXPRECISION support on *BSD fixes #2075 --- cmake/os.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/os.cmake b/cmake/os.cmake index 1321ef619..2d25e7aaa 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_EXPRECISION 1) endif () +if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") + set(EXTRALIB "${EXTRALIB} -lm") + set(NO_EXPRECISION 1) +endif () + if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") set(EXTRALIB "${EXTRALIB} -lm") endif () From bcdf1d49170508fd5c8250f802dd9018b7771534 Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Tue, 9 Apr 2019 14:13:24 +1000 Subject: [PATCH 533/935] Add in runtime CPU detection for POWER. 
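The detection itself leans on GCC's __builtin_cpu_is(), as the new
driver/others/dynamic_power.c below shows. Stripped to its core, the selection
logic is the following (a sketch returning strings instead of the real
gotoblas_t parameter-table pointers):

    /* Hypothetical distillation of get_coretype() in dynamic_power.c. */
    static const char *pick_power_core(void) {
        if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
            return "POWER6";
        if (__builtin_cpu_is("power8")) return "POWER8";
        if (__builtin_cpu_is("power9")) return "POWER9";
        return "unknown"; /* caller falls back to POWER8 with a warning */
    }

The patch also honors an OPENBLAS_CORETYPE environment variable, so the choice
can be forced at run time without rebuilding.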
--- Makefile.system | 6 ++ driver/others/Makefile | 8 +++ driver/others/dynamic_power.c | 102 ++++++++++++++++++++++++++++++++++ kernel/power/KERNEL.POWER8 | 32 +++++------ kernel/power/KERNEL.POWER9 | 32 +++++------ kernel/setparam-ref.c | 22 ++++++++ 6 files changed, 170 insertions(+), 32 deletions(-) create mode 100644 driver/others/dynamic_power.c diff --git a/Makefile.system b/Makefile.system index 53f89b2fa..a95d6190f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -528,6 +528,12 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 endif +ifeq ($(ARCH), power) +DYNAMIC_CORE = POWER6 +DYNAMIC_CORE += POWER8 +DYNAMIC_CORE += POWER9 +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= diff --git a/driver/others/Makefile b/driver/others/Makefile index 3dc2e7c1b..d4b5c26d5 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -18,8 +18,12 @@ ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH),arm64) COMMONOBJS += dynamic_arm64.$(SUFFIX) else +ifeq ($(ARCH),power) +COMMONOBJS += dynamic_power.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -78,8 +82,12 @@ ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH),arm64) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) else +ifeq ($(ARCH),power) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c new file mode 100644 index 000000000..0c4a87a5e --- /dev/null +++ b/driver/others/dynamic_power.c @@ -0,0 +1,102 @@ + +#include "common.h" + +extern gotoblas_t gotoblas_POWER6; +extern gotoblas_t gotoblas_POWER8; +extern gotoblas_t gotoblas_POWER9; + +extern void openblas_warning(int verbose, const char *msg); + +static char *corename[] = { + "unknown", + "POWER6", + "POWER8", + "POWER9" +}; + +#define NUM_CORETYPES 4 + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_POWER6) return corename[1]; + if (gotoblas == &gotoblas_POWER8) return corename[2]; + if (gotoblas == &gotoblas_POWER9) return corename[3]; + return corename[0]; +} + +static gotoblas_t *get_coretype(void) { + + if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) + return &gotoblas_POWER6; + if (__builtin_cpu_is("power8")) + return &gotoblas_POWER8; + if (__builtin_cpu_is("power9")) + return &gotoblas_POWER9; + return NULL; +} + +static gotoblas_t *force_coretype(char * coretype) { + + int i ; + int found = -1; + char message[128]; + + for ( i = 0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 1: return (&gotoblas_POWER6); + case 2: return (&gotoblas_POWER8); + case 3: return (&gotoblas_POWER9); + default: return NULL; + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); +} + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to POWER8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_POWER8; + } + + if (gotoblas && gotoblas -> init) { + 
strncpy(coren,gotoblas_corename(),20); + sprintf(coremsg, "Core: %s\n",coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index e6f69c7c4..43f004fbb 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMITCOPY = zgemm_tcopy_8_power8.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 86a931971..e166f252f 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power9.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S DGEMMOTCOPY = 
../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMITCOPY = zgemm_tcopy_8_power8.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 6d4028b0b..b964a8bad 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -718,6 +718,27 @@ static void init_parameter(void) { } #else // defined(ARCH_ARM64) +#if defined(ARCH_POWER) +static void init_parameter(void) { + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +} +#else //POWER + #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1303,4 +1324,5 @@ static void init_parameter(void) { } +#endif //POWER #endif //defined(ARCH_ARM64) From 40e53e52d645d1cbef76c8432847fa3c219b9dd2 Mon Sep 17 00:00:00 2001 From: Jeff Baylor Date: Mon, 22 Apr 2019 17:01:34 -0700 Subject: [PATCH 534/935] snprintf define consolidated to common.h --- common.h | 2 ++ driver/others/openblas_get_config.c | 6 ------ utest/ctest.h | 4 ---- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/common.h b/common.h index 239b2a850..0ac74bb20 100644 --- a/common.h +++ b/common.h @@ -85,6 +85,8 @@ extern "C" { #if !defined(_MSC_VER) #include <complex.h> +#elif _MSC_VER < 1900 +#define snprintf _snprintf #endif #include <math.h> diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index eca494dca..81648fb7c 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -35,12 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <string.h> -#if defined(_WIN32) && defined(_MSC_VER) -#if _MSC_VER < 1900 -#define snprintf _snprintf -#endif -#endif - static char* openblas_config_str="" "OpenBLAS " VERSION diff --git a/utest/ctest.h b/utest/ctest.h index f297dafba..d316b1494 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -83,10 +83,6 @@ struct ctest { #undef CTEST_SEGFAULT #endif -#if _MSC_VER < 1900 -#define snprintf _snprintf -#endif - #ifndef __cplusplus #define inline __inline #endif From 9a19616a282d0c01d6695c7419dff01895d25d73 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Apr 2019 18:55:47 +0200 Subject: [PATCH 535/935] Support INTERFACE64=1 --- relapack/inc/relapack.h | 116 ++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/relapack/inc/relapack.h b/relapack/inc/relapack.h index e421f352b..7f283e04d 100644 --- a/relapack/inc/relapack.h +++ b/relapack/inc/relapack.h @@ -1,67 +1,79 @@ #ifndef RELAPACK_H #define RELAPACK_H -void RELAPACK_slauum(const char *, const int *, float *, const int *, int *); -void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *); -void RELAPACK_clauum(const char *, const int *, float *, const int *, int *); -void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *); +#ifdef USE64BITINT + typedef BLASLONG blasint; + #if defined(OS_WINDOWS) && defined(__64BIT__) + #define blasabs(x) llabs(x) + #else + #define blasabs(x) labs(x) + #endif +#else + typedef int blasint; + #define blasabs(x) abs(x) +#endif -void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *); -void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *); -void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *); -void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *); +void RELAPACK_slauum(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dlauum(const char *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_clauum(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_zlauum(const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *); -void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *); -void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *); -void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *); +void RELAPACK_strtri(const char *, const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dtrtri(const char *, const char *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_ctrtri(const char *, const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_ztrtri(const char *, const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *); -void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *); -void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *); -void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); +void RELAPACK_spotrf(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dpotrf(const char *, const blasint *, double *, const blasint 
*, blasint *); +void RELAPACK_cpotrf(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_zpotrf(const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_spbtrf(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dpbtrf(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_cpbtrf(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_zpbtrf(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); -void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_ssytrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_ssytrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rook(const char *, const blasint *, float *, 
const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); -void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_sgetrf(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dgetrf(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_cgetrf(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_zgetrf(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); -void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void RELAPACK_sgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_cgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_zgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); -void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ssygst(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +void RELAPACK_dsygst(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); +void RELAPACK_chegst(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +void RELAPACK_zhegst(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); 
-void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); -void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); -void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); -void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void RELAPACK_strsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_dtrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); +void RELAPACK_ctrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_ztrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); -void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +void RELAPACK_stgsyl(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dtgsyl(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_ctgsyl(const char *, const blasint *, 
const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_ztgsyl(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, const blasint *, blasint *, blasint *); + +void RELAPACK_sgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +void RELAPACK_dgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +void RELAPACK_cgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +void RELAPACK_zgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); #endif /* RELAPACK_H */ From 798c448b0c9ed1d0546f3d660a26f66d6a852283 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Apr 2019 19:06:00 +0200 Subject: [PATCH 536/935] Add support for INTERFACE64 and fix XERBLA calls 1. Replaced all instances of "int" with "blasint" 2. 
Added string length as "hidden" third parameter in calls to fortran XERBLA --- relapack/src/blas.h | 106 +++++----- relapack/src/cgbtrf.c | 78 +++---- relapack/src/cgemmt.c | 66 +++--- relapack/src/cgetrf.c | 40 ++-- relapack/src/chegst.c | 38 ++-- relapack/src/chetrf.c | 72 +++---- relapack/src/chetrf_rec2.c | 32 +-- relapack/src/chetrf_rook.c | 72 +++---- relapack/src/chetrf_rook_rec2.c | 40 ++-- relapack/src/clauum.c | 28 +-- relapack/src/cpbtrf.c | 42 ++-- relapack/src/cpotrf.c | 28 +-- relapack/src/csytrf.c | 72 +++---- relapack/src/csytrf_rec2.c | 32 +-- relapack/src/csytrf_rook.c | 72 +++---- relapack/src/csytrf_rook_rec2.c | 40 ++-- relapack/src/ctgsyl.c | 62 +++--- relapack/src/ctrsyl.c | 52 ++--- relapack/src/ctrsyl_rec2.c | 36 ++-- relapack/src/ctrtri.c | 34 +-- relapack/src/dgbtrf.c | 80 +++---- relapack/src/dgemmt.c | 62 +++--- relapack/src/dgetrf.c | 40 ++-- relapack/src/dlauum.c | 28 +-- relapack/src/dpbtrf.c | 42 ++-- relapack/src/dpotrf.c | 28 +-- relapack/src/dsygst.c | 40 ++-- relapack/src/dsytrf.c | 72 +++---- relapack/src/dsytrf_rec2.c | 32 +-- relapack/src/dsytrf_rook.c | 72 +++---- relapack/src/dsytrf_rook_rec2.c | 38 ++-- relapack/src/dtgsyl.c | 66 +++--- relapack/src/dtrsyl.c | 56 ++--- relapack/src/dtrsyl_rec2.c | 58 ++--- relapack/src/dtrtri.c | 34 +-- relapack/src/f2c.c | 2 +- relapack/src/f2c.h | 13 ++ relapack/src/lapack.h | 124 +++++------ relapack/src/lapack_wrappers.c | 360 ++++++++++++++++---------------- relapack/src/relapack.h | 42 ++-- relapack/src/sgbtrf.c | 79 ++++--- relapack/src/sgemmt.c | 62 +++--- relapack/src/sgetrf.c | 40 ++-- relapack/src/slauum.c | 28 +-- relapack/src/spbtrf.c | 42 ++-- relapack/src/spotrf.c | 28 +-- relapack/src/ssygst.c | 38 ++-- relapack/src/ssytrf.c | 73 ++++--- relapack/src/ssytrf_rec2.c | 28 +-- relapack/src/ssytrf_rook.c | 72 +++---- relapack/src/ssytrf_rook_rec2.c | 32 +-- relapack/src/stgsyl.c | 66 +++--- relapack/src/strsyl.c | 56 ++--- relapack/src/strsyl_rec2.c | 50 ++--- relapack/src/strtri.c | 34 +-- relapack/src/zgbtrf.c | 78 +++---- relapack/src/zgemmt.c | 66 +++--- relapack/src/zgetrf.c | 40 ++-- relapack/src/zhegst.c | 38 ++-- relapack/src/zhetrf.c | 72 +++---- relapack/src/zhetrf_rec2.c | 36 ++-- relapack/src/zhetrf_rook.c | 72 +++---- relapack/src/zhetrf_rook_rec2.c | 38 ++-- relapack/src/zlauum.c | 28 +-- relapack/src/zpbtrf.c | 42 ++-- relapack/src/zpotrf.c | 28 +-- relapack/src/zsytrf.c | 72 +++---- relapack/src/zsytrf_rec2.c | 34 +-- relapack/src/zsytrf_rook.c | 72 +++---- relapack/src/zsytrf_rook_rec2.c | 36 ++-- relapack/src/ztgsyl.c | 62 +++--- relapack/src/ztrsyl.c | 52 ++--- relapack/src/ztrsyl_rec2.c | 42 ++-- relapack/src/ztrtri.c | 34 +-- 74 files changed, 2010 insertions(+), 1991 deletions(-) diff --git a/relapack/src/blas.h b/relapack/src/blas.h index 7441c1033..6d9f1a42a 100644 --- a/relapack/src/blas.h +++ b/relapack/src/blas.h @@ -1,61 +1,61 @@ #ifndef BLAS_H #define BLAS_H -extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); -extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); -extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); -extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); - -extern void BLAS(sscal)(const int *, const float *, float *, const int *); -extern void BLAS(dscal)(const int *, const double *, double *, const int *); -extern void BLAS(cscal)(const int *, const float *, float *, const int *); -extern void BLAS(zscal)(const int *, const double *, double 
*, const int *); - -extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); - -extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); - -extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const 
int *); -extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); -extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); -extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); - -extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); - -extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(sswap)(const blasint *, float *, const blasint *, float *, const blasint *); +extern void BLAS(dswap)(const blasint *, double *, const blasint *, double *, const blasint *); +extern void BLAS(cswap)(const blasint *, float *, const blasint *, float *, const blasint *); +extern void BLAS(zswap)(const blasint *, double *, const blasint *, double *, const blasint *); + +extern void BLAS(sscal)(const blasint *, const float *, float *, const blasint *); +extern void BLAS(dscal)(const blasint *, const double *, double *, const blasint *); +extern void BLAS(cscal)(const blasint *, const float *, float *, const blasint *); +extern void BLAS(zscal)(const blasint *, const double *, double *, const blasint *); + +extern void BLAS(saxpy)(const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(daxpy)(const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(caxpy)(const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(zaxpy)(const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(sgemv)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemv)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const 
blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemv)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemv)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); + +extern void BLAS(sgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); + +extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(ssyrk)(const char *, const char *, const blasint *, const blasint *, const float *, float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsyrk)(const char *, const char *, const blasint *, const blasint *, const double *, double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(cherk)(const char *, const char *, const blasint *, const blasint *, const float *, float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zherk)(const char *, const char *, const blasint *, const blasint *, const double *, double *, 
const blasint *, const double *, double *, const blasint *); + +extern void BLAS(ssymm)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsymm)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(chemm)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zhemm)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); + +extern void BLAS(ssyr2k)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsyr2k)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(cher2k)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zher2k)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); #if HAVE_XGEMMT -extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(sgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); #endif #endif /* 
BLAS_H */ diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index 90b2c8789..eddfdedf7 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, - const int *, float *, const int *, int *, float *, const int *, float *, - const int *, int *); +static void RELAPACK_cgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, float *, const blasint *, blasint *, float *, const blasint *, float *, + const blasint *, blasint *); /** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +13,9 @@ static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html * */ void RELAPACK_cgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_cgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CGBTRF", &minfo, strlen("CGBTRF")); return; } @@ -40,14 +40,14 @@ void RELAPACK_cgbtrf( const float ZERO[] = { 0., 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { float *const A_j = A + 2 * *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,11 @@ void RELAPACK_cgbtrf( } // Allocate work space - const int n1 = CREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = CREC_SPLIT(*n); + const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const blasint nWorkl = (kv > n1) ? n1 : kv; + const blasint mWorku = (*kl > n1) ? n1 : *kl; + const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_cgbtrf( /** cgbtrf's recursive compute kernel */ static void RELAPACK_cgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + float *Workl, const blasint *ldWorkl, float *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_cgbtrf_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * kv; // Splitting - const int n1 = MIN(CREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(CREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_cgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_cgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_cgbtrf_rec( for (j = 0; j < n22; j++) { float *const A_Rrj = A_Rr + 2 * *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const float tmpr = A_Rrj[2 * i]; const float tmpc = A_Rrj[2 * i + 1]; @@ -211,7 +211,7 @@ static void RELAPACK_cgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); diff --git a/relapack/src/cgemmt.c b/relapack/src/cgemmt.c index 28e2b00b0..3af4d790f 100644 --- a/relapack/src/cgemmt.c +++ b/relapack/src/cgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_cgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); /** CGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_cgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float 
*alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,15 +32,15 @@ void RELAPACK_cgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int ctransA = LAPACK(lsame)(transA, "C"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - const int ctransB = LAPACK(lsame)(transB, "C"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint ctransA = LAPACK(lsame)(transA, "C"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + const blasint ctransB = LAPACK(lsame)(transB, "C"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !ctransA && !notransA) @@ -58,7 +58,7 @@ void RELAPACK_cgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("CGEMMT", &info); + LAPACK(xerbla)("CGEMMT", &info, strlen("CGEMMT")); return; } @@ -76,10 +76,10 @@ void RELAPACK_cgemmt( /** cgemmt's recursive compute kernel */ static void RELAPACK_cgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { @@ -89,8 +89,8 @@ static void RELAPACK_cgemmt_rec( } // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -126,16 +126,16 @@ static void RELAPACK_cgemmt_rec( /** cgemmt's unblocked compute kernel */ static void RELAPACK_cgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -151,13 +151,13 @@ static void RELAPACK_cgemmt_rec2( float *const C_ii = C + 2 * *ldC * i + 2 * i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index b31a711d0..9aab718a0 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_cgetrf_rec(const int *, const int *, float *, - const int *, int *, int *); +static void RELAPACK_cgetrf_rec(const blasint *, const blasint *, float *, + const blasint *, blasint *, blasint *); /** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_cgetrf_rec(const int *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html */ void RELAPACK_cgetrf( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -25,12 +25,12 @@ void RELAPACK_cgetrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CGETRF", &minfo, strlen("CGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_cgetrf( if (*m < *n) { // Constants const float ONE[] = { 1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const float *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_cgetrf( /** cgetrf's recursive compute kernel */ static void RELAPACK_cgetrf_rec( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_CGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_cgetrf_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R float *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_cgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_cgetrf_rec( // apply pivots to A_BL LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c index dff875017..fe77b03ea 100644 --- a/relapack/src/chegst.c +++ b/relapack/src/chegst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_chegst_rec(const int *, const char *, const int *, - float *, const int *, const float *, const int *, - float *, const int *, int *); +static void RELAPACK_chegst_rec(const blasint *, const char *, const blasint *, + float *, const blasint *, const float *, const blasint *, + float *, const blasint *, blasint *); /** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_chegst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html * */ void RELAPACK_chegst( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_chegst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHEGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHEGST", &minfo, strlen("CHEGST")); return; } @@ -45,9 +45,9 @@ void RELAPACK_chegst( // Allocate work space float *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = CREC_SPLIT(*n); + const blasint n1 = CREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * 2 * sizeof(float)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_chegst( /** chegst's recursive compute kernel */ static void RELAPACK_chegst_rec( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - float *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_CHEGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_chegst_rec( const float MONE[] = { -1., 0. }; const float HALF[] = { .5, 0. }; const float MHALF[] = { -.5, 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/chetrf.c index 2928235e4..8cd3c0774 100644 --- a/relapack/src/chetrf.c +++ b/relapack/src/chetrf.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_chetrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html * */ void RELAPACK_chetrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_chetrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_chetrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_chetrf( /** chetrf's recursive compute kernel */ static void RELAPACK_chetrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_chetrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_chetrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_chetrf_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_chetrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_chetrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/chetrf_rec2.c b/relapack/src/chetrf_rec2.c index b5c8341b6..412f64cf7 100644 --- a/relapack/src/chetrf_rec2.c +++ b/relapack/src/chetrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * - nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, - int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, complex *w, + int *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2, r__3, r__4; complex q__1, q__2, q__3, q__4; @@ -38,22 +38,22 @@ static int c__1 = 1; void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k; + static blasint j, k; static float t, r1; static complex d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen), ccopy_(int *, complex *, int *, - complex *, int *), cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen), ccopy_(int *, complex *, blasint *, + complex *, blasint *), cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float absakk; - extern /* Subroutine */ int clacgv_(int *, complex *, int *); - extern int icamax_(int *, complex *, int *); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int + extern /* Subroutine */ blasint clacgv_(int *, complex *, blasint *); + extern blasint icamax_(int *, complex *, blasint *); + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int *); static float colmax, rowmax; diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c index 086393d57..3d2fa3216 100644 --- a/relapack/src/chetrf_rook.c +++ b/relapack/src/chetrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, 
float *, const int *, int *); +static void RELAPACK_chetrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html * */ void RELAPACK_chetrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_chetrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_chetrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_chetrf_rook( /** chetrf_rook's recursive compute kernel */ static void RELAPACK_chetrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_chetrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_chetrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_chetrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_chetrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_chetrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/chetrf_rook_rec2.c b/relapack/src/chetrf_rook_rec2.c index a42cbfd44..e0b2ff962 100644 --- a/relapack/src/chetrf_rook_rec2.c +++ b/relapack/src/chetrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, complex *a, int *lda, int *ipiv, - complex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, + complex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4, q__5; @@ -38,29 +38,29 @@ static int c__1 = 1; void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static float t, r1; static complex d11, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen); + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen); static float sfmin; - extern /* Subroutine */ int ccopy_(int *, complex *, int *, - complex *, int *); - static int itemp; - extern /* Subroutine */ int cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint ccopy_(int *, complex *, blasint *, + complex *, blasint *); + static blasint itemp; + extern /* Subroutine */ blasint cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float stemp, absakk; - extern /* Subroutine */ int clacgv_(int *, complex *, int *); - extern int icamax_(int *, complex *, int *); + extern /* Subroutine */ blasint clacgv_(int *, complex *, blasint *); + extern blasint icamax_(int *, complex *, blasint *); extern double slamch_(char *, ftnlen); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int *); static float colmax, rowmax; diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c index 36d6297cf..2bc93f182 100644 --- a/relapack/src/clauum.c +++ b/relapack/src/clauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_clauum_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_clauum_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. 
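The hunks above repeat one pattern across every ReLAPACK entry point: each integer that crosses the BLAS/LAPACK interface changes from int to blasint, so that INTERFACE64 (ILP64) builds of OpenBLAS see 64-bit indices end to end. The following self-contained sketch illustrates the convention; the simplified typedef is an assumption for illustration only (the real definition in OpenBLAS's common.h goes through BLASLONG):

#include <stdio.h>

/* Sketch of the blasint convention this patch targets. The real
 * typedef lives in OpenBLAS's common.h and goes through BLASLONG;
 * this simplified form is an assumption for illustration. */
#ifdef USE64BITINT
typedef long long blasint;   /* ILP64 interface: 64-bit indices */
#else
typedef int blasint;         /* default interface: 32-bit indices */
#endif

/* Fortran-style calling convention as used throughout ReLAPACK:
 * every scalar arrives by pointer, so the pointee type must match
 * the caller's storage exactly. */
static blasint sum_first(const blasint *n, const blasint *x) {
    blasint s = 0;
    for (blasint i = 0; i < *n; i++)
        s += x[i];
    return s;
}

int main(void) {
    const blasint n = 3;
    const blasint x[] = { 1, 2, 3 };
    /* In a -DUSE64BITINT build, passing an `int *` here would make
     * sum_first read 8 bytes per element and misinterpret both n and
     * x -- precisely the mismatch this patch removes from ReLAPACK. */
    printf("sizeof(blasint) = %zu, sum = %lld\n",
           sizeof(blasint), (long long)sum_first(&n, x));
    return 0;
}

Compiled with -DUSE64BITINT, sizeof(blasint) is 8, and any routine still declaring plain int would read only half of each pointed-to argument, which is exactly the class of bug the conversion removes.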
@@ -11,14 +11,14 @@ static void RELAPACK_clauum_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html * */ void RELAPACK_clauum( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_clauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CLAUUM", &minfo, strlen("CLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_clauum( /** clauum's recursive compute kernel */ static void RELAPACK_clauum_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_clauum_rec( const float ONE[] = { 1., 0. }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c index e0ea7b944..971e547c6 100644 --- a/relapack/src/cpbtrf.c +++ b/relapack/src/cpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, - float *, const int *, float *, const int *, int *); +static void RELAPACK_cpbtrf_rec(const char *, const blasint *, const blasint *, + float *, const blasint *, float *, const blasint *, blasint *); /** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. @@ -12,14 +12,14 @@ static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html * */ void RELAPACK_cpbtrf( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_cpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CPBTRF", &minfo, strlen("CPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_cpbtrf( const float ZERO[] = { 0., 0. }; // Allocate work space - const int n1 = CREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = CREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; float *Work = malloc(mWork * nWork * 2 * sizeof(float)); LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_cpbtrf( /** cpbtrf's recursive compute kernel */ static void RELAPACK_cpbtrf_rec( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - float *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + float *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_cpbtrf_rec( const float MONE[] = { -1., 0. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(CREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(CREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_cpbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c index e35caa7fa..0f8e7ebb0 100644 --- a/relapack/src/cpotrf.c +++ b/relapack/src/cpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_cpotrf_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_cpotrf_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. @@ -11,14 +11,14 @@ static void RELAPACK_cpotrf_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html * */ void RELAPACK_cpotrf( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_cpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CPOTRF", &minfo, strlen("CPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_cpotrf( /** cpotrf's recursive compute kernel */ static void RELAPACK_cpotrf_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_cpotrf_rec( const float MONE[] = { -1., 0. 
}; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c index 01c161d1a..2ebc31001 100644 --- a/relapack/src/csytrf.c +++ b/relapack/src/csytrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_csytrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html * */ void RELAPACK_csytrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_csytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_csytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_csytrf( /** csytrf's recursive compute kernel */ static void RELAPACK_csytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_csytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_csytrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_csytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_csytrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/csytrf_rec2.c b/relapack/src/csytrf_rec2.c index 9d6bd849d..216a9e248 100644 --- a/relapack/src/csytrf_rec2.c +++ b/relapack/src/csytrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int * - nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, - int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, complex *w, + int *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2, r__3, r__4; complex q__1, q__2, q__3; @@ -38,21 +38,21 @@ static int c__1 = 1; void c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k; + static blasint j, k; static complex t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; - extern /* Subroutine */ int cscal_(int *, complex *, complex *, - int *); + extern /* Subroutine */ blasint cscal_(int *, complex *, complex *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen), ccopy_(int *, complex *, int *, - complex *, int *), cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen), ccopy_(int *, complex *, blasint *, + complex *, blasint *), cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float absakk; - extern int icamax_(int *, complex *, int *); + extern blasint icamax_(int *, complex *, blasint *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c index aa7dd0e57..e8a9865cc 100644 --- a/relapack/src/csytrf_rook.c +++ b/relapack/src/csytrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_csytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint 
*, blasint *, float *, const blasint *, blasint *); /** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html * */ void RELAPACK_csytrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_csytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_csytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_csytrf_rook( /** csytrf_rook's recursive compute kernel */ static void RELAPACK_csytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_csytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_csytrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_csytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_csytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_csytrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/csytrf_rook_rec2.c b/relapack/src/csytrf_rook_rec2.c index 6638338a6..2561065d7 100644 --- a/relapack/src/csytrf_rook_rec2.c +++ b/relapack/src/csytrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, complex *a, int *lda, int *ipiv, - complex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, + complex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4; @@ -38,27 +38,27 @@ static int c__1 = 1; void c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static complex t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static float alpha; - extern /* Subroutine */ int cscal_(int *, complex *, complex *, - int *); + extern /* Subroutine */ blasint cscal_(int *, complex *, complex *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen); + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen); static float sfmin; - extern /* Subroutine */ int ccopy_(int *, complex *, int *, - complex *, int *); - static int itemp; - extern /* Subroutine */ int cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint ccopy_(int *, complex *, blasint *, + complex *, blasint *); + static blasint itemp; + extern /* Subroutine */ blasint cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float stemp, absakk; - extern int icamax_(int *, complex *, int *); + extern blasint icamax_(int *, complex *, blasint *); extern double slamch_(char *, ftnlen); static float colmax, rowmax; diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c index 15c738baf..704f3ef23 100644 --- a/relapack/src/ctgsyl.c +++ b/relapack/src/ctgsyl.c @@ -1,10 +1,10 @@ #include "relapack.h" #include -static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, - const int *, const float *, const int *, const float *, const int *, - float *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, float *, float *, int *); +static void RELAPACK_ctgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const float *, const blasint *, const float *, const blasint *, + float *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, float *, float *, blasint *); /** CTGSYL solves the generalized Sylvester equation. 
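A second fix rides along in these hunks: every xerbla error report gains strlen(<name>) as a trailing argument. LAPACK's XERBLA receives the routine name as a Fortran CHARACTER*(*), which under the common f2c/gfortran ABI arrives as a char pointer plus a hidden length argument (the ftnlen visible in the _rec2 prototypes in this patch). A self-contained sketch of that convention, with a stub xerbla_ in place of the real routine and plain int standing in for blasint:

#include <stdio.h>
#include <string.h>

typedef long ftnlen;  /* f2c's hidden-length type, as in the _rec2 files */

/* Stub standing in for LAPACK's XERBLA so the sketch is self-contained;
 * the real routine prints to the Fortran error unit and stops. */
static void xerbla_(const char *srname, const int *info, ftnlen srname_len) {
    /* %.*s honors the explicit length: Fortran strings carry no NUL
     * terminator, which is exactly why the hidden argument exists. */
    printf(" ** On entry to %.*s parameter number %d had an illegal value\n",
           (int)srname_len, srname, *info);
}

int main(void) {
    const int minfo = 4;
    /* Mirrors the patched call sites: the length is passed explicitly
     * instead of leaving the callee to read an undefined stack slot. */
    xerbla_("CLAUUM", &minfo, (ftnlen)strlen("CLAUUM"));
    return 0;
}

Without the third argument the callee reads an undefined value for the name length; that happens to work on some ABIs and prints garbage or crashes on others, so passing strlen() makes the call well-defined everywhere.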
@@ -14,21 +14,21 @@ static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html * */ void RELAPACK_ctgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "C"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "C"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -57,8 +57,8 @@ void RELAPACK_ctgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTGSYL", &minfo, strlen("CTGSYL")); return; } @@ -74,8 +74,8 @@ void RELAPACK_ctgsyl( // Constant const float ZERO[] = { 0., 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -86,7 +86,7 @@ void RELAPACK_ctgsyl( } float scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; float dscale = 0; @@ -119,13 +119,13 @@ void RELAPACK_ctgsyl( /** ctgsyl's recursive vompute kernel */ static void RELAPACK_ctgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dsum, float *dscale, - int *info + blasint *info ) { if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) { @@ -137,18 +137,18 @@ static void RELAPACK_ctgsyl_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1., 0. }; float scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = CREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = CREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -206,8 +206,8 @@ static void RELAPACK_ctgsyl_rec( } } else { // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c index b548d5354..fed6e847e 100644 --- a/relapack/src/ctrsyl.c +++ b/relapack/src/ctrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, - const int *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, int *); +static void RELAPACK_ctrsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, blasint *); /** CTRSYL solves the complex Sylvester matrix equation. @@ -12,18 +12,18 @@ static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html * */ void RELAPACK_ctrsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!ctransA && !notransA) *info = -1; @@ -42,8 +42,8 @@ void RELAPACK_ctrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTRSYL", &minfo, strlen("CTRSYL")); return; } @@ -58,11 +58,11 @@ void RELAPACK_ctrsyl( /** ctrsyl's recursive compute kernel */ static void RELAPACK_ctrsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { @@ -75,18 +75,18 @@ static void RELAPACK_ctrsyl_rec( const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; const float MSGN[] = { -*isgn, 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1., 0. }; float scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = CREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = CREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -122,8 +122,8 @@ static void RELAPACK_ctrsyl_rec( } } else { // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c index 518574868..556491c7a 100644 --- a/relapack/src/ctrsyl_rec2.c +++ b/relapack/src/ctrsyl_rec2.c @@ -14,16 +14,16 @@ #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES -complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { - extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); +complex cdotu_fun(int *n, complex *x, blasint *incx, complex *y, blasint *incy) { + extern void cdotu_(complex *, blasint *, complex *, blasint *, complex *, blasint *); complex result; cdotu_(&result, n, x, incx, y, incy); return result; } #define cdotu_ cdotu_fun -complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { - extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); +complex cdotc_fun(int *n, complex *x, blasint *incx, complex *y, blasint *incy) { + extern void cdotc_(complex *, blasint *, complex *, blasint *, complex *, blasint *); complex result; cdotc_(&result, n, x, incx, y, incy); return result; @@ -43,7 +43,7 @@ complex cladiv_fun(complex *a, complex *b) { /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; /** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) * @@ -51,12 +51,12 @@ static int c__1 = 1; * It serves as an unblocked kernel in the recursive algorithms. 
* */ /* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int - *isgn, int *m, int *n, complex *a, int *lda, complex *b, - int *ldb, complex *c__, int *ldc, float *scale, int *info, + *isgn, blasint *m, blasint *n, complex *a, blasint *lda, complex *b, + int *ldb, complex *c__, blasint *ldc, float *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4; @@ -66,7 +66,7 @@ static int c__1 = 1; void r_cnjg(complex *, complex *); /* Local variables */ - static int j, k, l; + static blasint j, k, l; static complex a11; static float db; static complex x11; @@ -75,20 +75,20 @@ static int c__1 = 1; static float dum[1], eps, sgn, smin; static complex suml, sumr; /* Complex */ complex cdotc_(int *, complex *, int - *, complex *, int *); - extern int lsame_(char *, char *, ftnlen, ftnlen); + *, complex *, blasint *); + extern blasint lsame_(char *, char *, ftnlen, ftnlen); /* Complex */ complex cdotu_(int *, complex *, int - *, complex *, int *); - extern /* Subroutine */ int slabad_(float *, float *); - extern float clange_(char *, int *, int *, complex *, - int *, float *, ftnlen); + *, complex *, blasint *); + extern /* Subroutine */ blasint slabad_(float *, float *); + extern float clange_(char *, blasint *, blasint *, complex *, + blasint *, float *, ftnlen); /* Complex */ complex cladiv_(complex *, complex *); static float scaloc; extern float slamch_(char *, ftnlen); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int - *), xerbla_(char *, int *, ftnlen); + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int + *), xerbla_(char *, blasint *, ftnlen); static float bignum; - static int notrna, notrnb; + static blasint notrna, notrnb; static float smlnum; /* Parameter adjustments */ diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c index 0262cb59d..5201a24c7 100644 --- a/relapack/src/ctrtri.c +++ b/relapack/src/ctrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, - float *, const int *, int *); +static void RELAPACK_ctrtri_rec(const char *, const char *, const blasint *, + float *, const blasint *, blasint *); /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. 
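The ctrsyl_rec2.c hunk above also adjusts the BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES wrappers: under some Fortran ABIs (g77, and f2c-translated code), a COMPLEX-valued function such as cdotu delivers its result through a hidden leading pointer argument rather than by value, and cdotu_fun/cdotc_fun re-expose a by-value interface to the translated kernel. A self-contained sketch of that shape, with cdotu_ stubbed as a plain loop so the example links without a BLAS library (the struct layout follows f2c conventions and is an assumption here):

#include <stdio.h>

typedef struct { float r, i; } complex;  /* f2c's single-precision complex */

/* Routine-style cdotu: the result is delivered via the first argument.
 * Stubbed with a trivial unconjugated dot-product loop; the real symbol
 * comes from the BLAS. */
static void cdotu_(complex *result, const int *n,
                   const complex *x, const int *incx,
                   const complex *y, const int *incy) {
    complex s = { 0.f, 0.f };
    for (int k = 0; k < *n; k++) {
        const complex a = x[k * *incx], b = y[k * *incy];
        s.r += a.r * b.r - a.i * b.i;
        s.i += a.r * b.i + a.i * b.r;
    }
    *result = s;
}

/* Function-style wrapper, mirroring ctrsyl_rec2.c's
 * BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES branch. */
static complex cdotu_fun(int *n, complex *x, int *incx,
                         complex *y, int *incy) {
    complex result;
    cdotu_(&result, n, x, incx, y, incy);
    return result;
}

int main(void) {
    complex x[] = { {1.f, 0.f}, {0.f, 1.f} };
    complex y[] = { {2.f, 0.f}, {0.f, 3.f} };
    int n = 2, one = 1;
    const complex d = cdotu_fun(&n, x, &one, y, &one);
    printf("cdotu = (%g, %g)\n", (double)d.r, (double)d.i);  /* (-1, 0) */
    return 0;
}

The real wrappers forward to the library's cdotu_/cdotc_ symbols, and the #define at the end of each wrapper block redirects the translated code's call sites to the function-style version.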
@@ -11,16 +11,16 @@ static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html * */ void RELAPACK_ctrtri( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_ctrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTRTRI", &minfo, strlen("CTRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_ctrtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_ctrtri( /** ctrtri's recursive compute kernel */ static void RELAPACK_ctrtri_rec( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_ctrtri_rec( const float MONE[] = { -1., 0. }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index 1a1757d31..f4b443629 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -1,9 +1,8 @@ #include "relapack.h" #include "stdlib.h" - -static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, - const int *, double *, const int *, int *, double *, const int *, double *, - const int *, int *); +static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, + const blasint *, blasint *); /** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +12,9 @@ static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html * */ void RELAPACK_dgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +30,8 @@ void RELAPACK_dgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DGBTRF", &minfo, strlen("DGBTRF")); return; } @@ -40,14 +39,14 @@ void RELAPACK_dgbtrf( const double ZERO[] = { 0. 
}; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { double *const A_j = A + *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +54,12 @@ void RELAPACK_dgbtrf( } // Allocate work space - const int n1 = DREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = DREC_SPLIT(*n); + const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs( (kv > n1) ? n1 : kv); + const blasint mWorku = abs( (*kl > n1) ? n1 : *kl); +// const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl); + const blasint nWorku = abs( (*kl > n1) ? MAX(1, *n - *kl) : *kl); double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); double *Worku = malloc(mWorku * nWorku * sizeof(double)); LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_dgbtrf( /** dgbtrf's recursive compute kernel */ static void RELAPACK_dgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + double *Workl, const blasint *ldWorkl, double *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_dgbtrf_rec( // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. 
 };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
     // Loop iterators
-    int i, j;
+    blasint i, j;
 
     // Output upper band width
-    const int kv = *ku + *kl;
+    const blasint kv = *ku + *kl;
 
     // Unskew A
-    const int ldA[] = { *ldAb - 1 };
+    const blasint ldA[] = { *ldAb - 1 };
     double *const A = Ab + kv;
 
     // Splitting
-    const int n1 = MIN(DREC_SPLIT(*n), *kl);
-    const int n2 = *n - n1;
-    const int m1 = MIN(n1, *m);
-    const int m2 = *m - m1;
-    const int mn1 = MIN(m1, n1);
-    const int mn2 = MIN(m2, n2);
+    const blasint n1 = MIN(DREC_SPLIT(*n), *kl);
+    const blasint n2 = *n - n1;
+    const blasint m1 = MIN(n1, *m);
+    const blasint m2 = *m - m1;
+    const blasint mn1 = MIN(m1, n1);
+    const blasint mn2 = MIN(m2, n2);
 
     // Ab_L *
     // Ab_BR
@@ -129,14 +129,14 @@ static void RELAPACK_dgbtrf_rec(
 
     // ipiv_T
     // ipiv_B
-    int *const ipiv_T = ipiv;
-    int *const ipiv_B = ipiv + n1;
+    blasint *const ipiv_T = ipiv;
+    blasint *const ipiv_B = ipiv + n1;
 
     // Banded splitting
-    const int n21 = MIN(n2, kv - n1);
-    const int n22 = MIN(n2 - n21, n1);
-    const int m21 = MIN(m2, *kl - m1);
-    const int m22 = MIN(m2 - m21, m1);
+    const blasint n21 = MIN(n2, kv - n1);
+    const blasint n22 = MIN(n2 - n21, n1);
+    const blasint m21 = MIN(m2, *kl - m1);
+    const blasint m22 = MIN(m2 - m21, m1);
 
     //    n1   n21   n22
     // m  *    A_Rl  ARr
@@ -164,7 +164,7 @@ static void RELAPACK_dgbtrf_rec(
 
     // partially redo swaps in A_L
     for (i = 0; i < mn1; i++) {
-        const int ip = ipiv_T[i] - 1;
+        const blasint ip = ipiv_T[i] - 1;
         if (ip != i) {
             if (ip < *kl)
                 BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
@@ -180,7 +180,7 @@ static void RELAPACK_dgbtrf_rec(
     for (j = 0; j < n22; j++) {
         double *const A_Rrj = A_Rr + *ldA * j;
         for (i = j; i < mn1; i++) {
-            const int ip = ipiv_T[i] - 1;
+            const blasint ip = ipiv_T[i] - 1;
             if (ip != i) {
                 const double tmp = A_Rrj[i];
                 A_Rrj[i] = A_Rr[ip];
@@ -208,7 +208,7 @@ static void RELAPACK_dgbtrf_rec(
 
     // partially undo swaps in A_L
     for (i = mn1 - 1; i >= 0; i--) {
-        const int ip = ipiv_T[i] - 1;
+        const blasint ip = ipiv_T[i] - 1;
         if (ip != i) {
             if (ip < *kl)
                 BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
diff --git a/relapack/src/dgemmt.c b/relapack/src/dgemmt.c
index 9c925b586..1ceab6c37 100644
--- a/relapack/src/dgemmt.c
+++ b/relapack/src/dgemmt.c
@@ -1,12 +1,12 @@
 #include "relapack.h"
 
 static void RELAPACK_dgemmt_rec(const char *, const char *, const char *,
-    const int *, const int *, const double *, const double *, const int *,
-    const double *, const int *, const double *, double *, const int *);
+    const blasint *, const blasint *, const double *, const double *, const blasint *,
+    const double *, const blasint *, const double *, double *, const blasint *);
 
 static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *,
-    const int *, const int *, const double *, const double *, const int *,
-    const double *, const int *, const double *, double *, const int *);
+    const blasint *, const blasint *, const double *, const double *, const blasint *,
+    const double *, const blasint *, const double *, double *, const blasint *);
 
 
 /** DGEMMT computes a matrix-matrix product with general matrices but updates
@@ -20,10 +20,10 @@ static void RELAPACK_dgemmt_rec2(const char *,
  * */
 void RELAPACK_dgemmt(
     const char *uplo, const char *transA, const char *transB,
-    const int *n, const int *k,
-    const double *alpha, const double *A, const int *ldA,
-    const double *B, const int *ldB,
-    const double *beta, double *C, const int *ldC
+    const blasint *n, const blasint *k,
+    const double *alpha, const double *A, const blasint *ldA,
+    const double *B, const blasint *ldB,
+    const double *beta, double *C, const blasint *ldC
 ) {
 
 #if HAVE_XGEMMT
@@ -32,13 +32,13 @@ void RELAPACK_dgemmt(
 #else
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
-    const int notransA = LAPACK(lsame)(transA, "N");
-    const int tranA = LAPACK(lsame)(transA, "T");
-    const int notransB = LAPACK(lsame)(transB, "N");
-    const int tranB = LAPACK(lsame)(transB, "T");
-    int info = 0;
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
+    const blasint notransA = LAPACK(lsame)(transA, "N");
+    const blasint tranA = LAPACK(lsame)(transA, "T");
+    const blasint notransB = LAPACK(lsame)(transB, "N");
+    const blasint tranB = LAPACK(lsame)(transB, "T");
+    blasint info = 0;
     if (!lower && !upper)
         info = 1;
     else if (!tranA && !notransA)
@@ -56,7 +56,7 @@ void RELAPACK_dgemmt(
     else if (*ldC < MAX(1, *n))
         info = 13;
     if (info) {
-        LAPACK(xerbla)("DGEMMT", &info);
+        LAPACK(xerbla)("DGEMMT", &info, strlen("DGEMMT"));
         return;
     }
 
@@ -74,10 +74,10 @@ void RELAPACK_dgemmt(
 /** dgemmt's recursive compute kernel */
 static void RELAPACK_dgemmt_rec(
     const char *uplo, const char *transA, const char *transB,
-    const int *n, const int *k,
-    const double *alpha, const double *A, const int *ldA,
-    const double *B, const int *ldB,
-    const double *beta, double *C, const int *ldC
+    const blasint *n, const blasint *k,
+    const double *alpha, const double *A, const blasint *ldA,
+    const double *B, const blasint *ldB,
+    const double *beta, double *C, const blasint *ldC
 ) {
 
     if (*n <= MAX(CROSSOVER_DGEMMT, 1)) {
@@ -87,8 +87,8 @@ static void RELAPACK_dgemmt_rec(
     }
 
     // Splitting
-    const int n1 = DREC_SPLIT(*n);
-    const int n2 = *n - n1;
+    const blasint n1 = DREC_SPLIT(*n);
+    const blasint n2 = *n - n1;
 
     // A_T
     // A_B
@@ -124,16 +124,16 @@ static void RELAPACK_dgemmt_rec(
 /** dgemmt's unblocked compute kernel */
 static void RELAPACK_dgemmt_rec2(
     const char *uplo, const char *transA, const char *transB,
-    const int *n, const int *k,
-    const double *alpha, const double *A, const int *ldA,
-    const double *B, const int *ldB,
-    const double *beta, double *C, const int *ldC
+    const blasint *n, const blasint *k,
+    const double *alpha, const double *A, const blasint *ldA,
+    const double *B, const blasint *ldB,
+    const double *beta, double *C, const blasint *ldC
 ) {
 
-    const int incB = (*transB == 'N') ? 1 : *ldB;
-    const int incC = 1;
+    const blasint incB = (*transB == 'N') ? 1 : *ldB;
+    const blasint incC = 1;
 
-    int i;
+    blasint i;
     for (i = 0; i < *n; i++) {
         // A_0
         // A_i
@@ -149,13 +149,13 @@ static void RELAPACK_dgemmt_rec2(
         double *const C_ii = C + *ldC * i + i;
 
         if (*uplo == 'L') {
-            const int nmi = *n - i;
+            const blasint nmi = *n - i;
             if (*transA == 'N')
                 BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
             else
                 BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
         } else {
-            const int ip1 = i + 1;
+            const blasint ip1 = i + 1;
             if (*transA == 'N')
                 BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
             else
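For reference, the operation this file decomposes can be written as a plain triple loop. The following stand-alone sketch (illustrative only, not part of the patch; myint is a hypothetical stand-in for blasint) spells out the lower-triangular, non-transposed case of C := alpha*A*B + beta*C, where only entries on or below the diagonal are written:

    /* Illustrative sketch of what DGEMMT computes (uplo='L', no transposes).
     * Column-major storage, n x n result, inner dimension k. */
    typedef long long myint;   /* hypothetical stand-in for blasint */

    static void gemmt_lower_ref(myint n, myint k, double alpha,
                                const double *A, myint ldA,
                                const double *B, myint ldB,
                                double beta, double *C, myint ldC) {
        for (myint j = 0; j < n; j++)
            for (myint i = j; i < n; i++) {    /* i >= j: lower triangle only */
                double s = 0.;
                for (myint l = 0; l < k; l++)
                    s += A[i + ldA * l] * B[l + ldB * j];
                C[i + ldC * j] = alpha * s + beta * C[i + ldC * j];
            }
    }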
diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c
index 07f5472fd..c4bce8fc5 100644
--- a/relapack/src/dgetrf.c
+++ b/relapack/src/dgetrf.c
@@ -1,7 +1,7 @@
 #include "relapack.h"
 
-static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
-    const int *, int *, int *);
+static void RELAPACK_dgetrf_rec(const blasint *, const blasint *, double *,
+    const blasint *, blasint *, blasint *);
 
 /** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
@@ -11,9 +11,9 @@ static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
  * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
  * */
 void RELAPACK_dgetrf(
-    const int *m, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    int *info
+    const blasint *m, const blasint *n,
+    double *A, const blasint *ldA, blasint *ipiv,
+    blasint *info
 ) {
 
     // Check arguments
@@ -25,12 +25,12 @@ void RELAPACK_dgetrf(
     else if (*ldA < MAX(1, *n))
         *info = -4;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DGETRF", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF"));
         return;
     }
 
-    const int sn = MIN(*m, *n);
+    const blasint sn = MIN(*m, *n);
 
     RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
 
@@ -38,10 +38,10 @@ void RELAPACK_dgetrf(
     if (*m < *n) {
         // Constants
         const double ONE[] = { 1. };
-        const int iONE[] = { 1. };
+        const blasint iONE[] = { 1 };
 
         // Splitting
-        const int rn = *n - *m;
+        const blasint rn = *n - *m;
 
         // A_L A_R
         const double *const A_L = A;
@@ -57,9 +57,9 @@ void RELAPACK_dgetrf(
 
 /** dgetrf's recursive compute kernel */
 static void RELAPACK_dgetrf_rec(
-    const int *m, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    int *info
+    const blasint *m, const blasint *n,
+    double *A, const blasint *ldA, blasint *ipiv,
+    blasint *info
 ) {
 
     if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
@@ -71,12 +71,12 @@ static void RELAPACK_dgetrf_rec(
     // Constants
     const double ONE[] = { 1. };
     const double MONE[] = { -1. };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
     // Splitting
-    const int n1 = DREC_SPLIT(*n);
-    const int n2 = *n - n1;
-    const int m2 = *m - n1;
+    const blasint n1 = DREC_SPLIT(*n);
+    const blasint n2 = *n - n1;
+    const blasint m2 = *m - n1;
 
     // A_L A_R
     double *const A_L = A;
@@ -91,8 +91,8 @@ static void RELAPACK_dgetrf_rec(
 
     // ipiv_T
     // ipiv_B
-    int *const ipiv_T = ipiv;
-    int *const ipiv_B = ipiv + n1;
+    blasint *const ipiv_T = ipiv;
+    blasint *const ipiv_B = ipiv + n1;
 
     // recursion(A_L, ipiv_T)
     RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
@@ -111,7 +111,7 @@ static void RELAPACK_dgetrf_rec(
     // apply pivots to A_BL
     LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
     // shift pivots
-    int i;
+    blasint i;
     for (i = 0; i < n2; i++)
         ipiv_B[i] += n1;
 }
diff --git a/relapack/src/dlauum.c b/relapack/src/dlauum.c
index d722ea809..6c7dcccb3 100644
--- a/relapack/src/dlauum.c
+++ b/relapack/src/dlauum.c
@@ -1,7 +1,7 @@
 #include "relapack.h"
 
-static void RELAPACK_dlauum_rec(const char *, const int *, double *,
-    const int *, int *);
+static void RELAPACK_dlauum_rec(const char *, const blasint *, double *,
+    const blasint *, blasint *);
 
 /** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
 
@@ -11,14 +11,14 @@ static void RELAPACK_dlauum_rec(const char *, const int *, double *,
  * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html
  * */
 void RELAPACK_dlauum(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
+    const char *uplo, const blasint *n,
+    double *A, const blasint *ldA,
+    blasint *info
 ) {
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
     *info = 0;
     if (!lower && !upper)
         *info = -1;
@@ -27,8 +27,8 @@ void RELAPACK_dlauum(
     else if (*ldA < MAX(1, *n))
         *info = -4;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DLAUUM", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DLAUUM", &minfo, strlen("DLAUUM"));
         return;
     }
 
@@ -42,9 +42,9 @@ void RELAPACK_dlauum(
 
 /** dlauum's recursive compute kernel */
 static void RELAPACK_dlauum_rec(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
+    const char *uplo, const blasint *n,
+    double *A, const blasint *ldA,
+    blasint *info
 ) {
 
     if (*n <= MAX(CROSSOVER_DLAUUM, 1)) {
@@ -57,8 +57,8 @@ static void RELAPACK_dlauum_rec(
     const double ONE[] = { 1. };
 
     // Splitting
-    const int n1 = DREC_SPLIT(*n);
-    const int n2 = *n - n1;
+    const blasint n1 = DREC_SPLIT(*n);
+    const blasint n2 = *n - n1;
 
     // A_TL A_TR
     // A_BL A_BR
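The xerbla change that recurs throughout these files appends the hidden string-length argument that Fortran compilers pass for every CHARACTER parameter; without it, the Fortran callee may read an undefined length. A minimal sketch of the convention (illustrative only; the hidden argument's exact type is ABI-dependent, e.g. size_t for modern gfortran):

    /* Sketch of the hidden-length convention (illustrative).
     * Fortran:  SUBROUTINE XERBLA( SRNAME, INFO )  is seen from C roughly as: */
    #include <string.h>

    extern void xerbla_(const char *srname, const int *info, size_t srname_len);

    static void report_error(int info) {
        /* the trailing length stands in for Fortran's implicit CHARACTER length */
        xerbla_("DGETRF", &info, strlen("DGETRF"));
    }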
diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c
index 6fd0ebe48..9380b28ad 100644
--- a/relapack/src/dpbtrf.c
+++ b/relapack/src/dpbtrf.c
@@ -1,8 +1,8 @@
 #include "relapack.h"
 #include "stdlib.h"
 
-static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *,
-    double *, const int *, double *, const int *, int *);
+static void RELAPACK_dpbtrf_rec(const char *, const blasint *, const blasint *,
+    double *, const blasint *, double *, const blasint *, blasint *);
 
 /** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
 
@@ -12,14 +12,14 @@ static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *,
  * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html
  * */
 void RELAPACK_dpbtrf(
-    const char *uplo, const int *n, const int *kd,
-    double *Ab, const int *ldAb,
-    int *info
+    const char *uplo, const blasint *n, const blasint *kd,
+    double *Ab, const blasint *ldAb,
+    blasint *info
 ) {
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
     *info = 0;
     if (!lower && !upper)
         *info = -1;
@@ -30,8 +30,8 @@ void RELAPACK_dpbtrf(
     else if (*ldAb < *kd + 1)
         *info = -5;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DPBTRF", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DPBTRF", &minfo, strlen("DPBTRF"));
         return;
     }
 
@@ -42,9 +42,9 @@ void RELAPACK_dpbtrf(
     const double ZERO[] = { 0. };
 
     // Allocate work space
-    const int n1 = DREC_SPLIT(*n);
-    const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
-    const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
+    const blasint n1 = DREC_SPLIT(*n);
+    const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
+    const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
     double *Work = malloc(mWork * nWork * sizeof(double));
     LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
 
@@ -58,10 +58,10 @@ void RELAPACK_dpbtrf(
 
 /** dpbtrf's recursive compute kernel */
 static void RELAPACK_dpbtrf_rec(
-    const char *uplo, const int *n, const int *kd,
-    double *Ab, const int *ldAb,
-    double *Work, const int *ldWork,
-    int *info
+    const char *uplo, const blasint *n, const blasint *kd,
+    double *Ab, const blasint *ldAb,
+    double *Work, const blasint *ldWork,
+    blasint *info
 ){
 
     if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
@@ -75,12 +75,12 @@ static void RELAPACK_dpbtrf_rec(
     const double MONE[] = { -1. };
 
     // Unskew A
-    const int ldA[] = { *ldAb - 1 };
+    const blasint ldA[] = { *ldAb - 1 };
     double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
 
     // Splitting
-    const int n1 = MIN(DREC_SPLIT(*n), *kd);
-    const int n2 = *n - n1;
+    const blasint n1 = MIN(DREC_SPLIT(*n), *kd);
+    const blasint n2 = *n - n1;
 
     // *  *
     // *  Ab_BR
@@ -99,8 +99,8 @@ static void RELAPACK_dpbtrf_rec(
         return;
 
     // Banded splitting
-    const int n21 = MIN(n2, *kd - n1);
-    const int n22 = MIN(n2 - n21, n1);
+    const blasint n21 = MIN(n2, *kd - n1);
+    const blasint n22 = MIN(n2 - n21, n1);
 
     //    n1    n21    n22
     // n1 *     A_TRl  A_TRr
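All of these kernels share the same shape: split at DREC_SPLIT(*n), recurse on the leading block, couple the blocks with Level-3 BLAS, and recurse on the trailing block, falling back to the unblocked LAPACK routine below a crossover size. A stripped-down sketch of that control flow (hypothetical macro values; the real DREC_SPLIT and CROSSOVER_* definitions live in ReLAPACK's configuration headers):

    /* Sketch of the recursive-splitting skeleton (hypothetical names/values). */
    #define MY_CROSSOVER 24
    #define MY_SPLIT(n)  ((n) / 2)   /* assumption: roughly halve the problem */

    static void rec_kernel(int n) {
        if (n <= MY_CROSSOVER) {
            /* unblocked base case would run here */
            return;
        }
        const int n1 = MY_SPLIT(n);
        const int n2 = n - n1;
        rec_kernel(n1);   /* factor the leading block                      */
        /* ... update the coupling blocks with Level-3 BLAS here ...       */
        rec_kernel(n2);   /* then recurse on the trailing block            */
    }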
diff --git a/relapack/src/dpotrf.c b/relapack/src/dpotrf.c
index c14fb3d71..cf326b18f 100644
--- a/relapack/src/dpotrf.c
+++ b/relapack/src/dpotrf.c
@@ -1,7 +1,7 @@
 #include "relapack.h"
 
-static void RELAPACK_dpotrf_rec(const char *, const int *, double *,
-    const int *, int *);
+static void RELAPACK_dpotrf_rec(const char *, const blasint *, double *,
+    const blasint *, blasint *);
 
 /** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
 
@@ -11,14 +11,14 @@ static void RELAPACK_dpotrf_rec(const char *, const int *, double *,
  * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
  * */
 void RELAPACK_dpotrf(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
+    const char *uplo, const blasint *n,
+    double *A, const blasint *ldA,
+    blasint *info
 ) {
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
     *info = 0;
     if (!lower && !upper)
         *info = -1;
@@ -27,8 +27,8 @@ void RELAPACK_dpotrf(
     else if (*ldA < MAX(1, *n))
         *info = -4;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DPOTRF", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DPOTRF", &minfo, strlen("DPOTRF"));
         return;
     }
 
@@ -42,9 +42,9 @@ void RELAPACK_dpotrf(
 
 /** dpotrf's recursive compute kernel */
 static void RELAPACK_dpotrf_rec(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
+    const char *uplo, const blasint *n,
+    double *A, const blasint *ldA,
+    blasint *info
 ){
 
     if (*n <= MAX(CROSSOVER_DPOTRF, 1)) {
@@ -58,8 +58,8 @@ static void RELAPACK_dpotrf_rec(
     const double MONE[] = { -1. };
 
     // Splitting
-    const int n1 = DREC_SPLIT(*n);
-    const int n2 = *n - n1;
+    const blasint n1 = DREC_SPLIT(*n);
+    const blasint n2 = *n - n1;
 
     // A_TL A_TR
     // A_BL A_BR
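Since the whole interface now takes blasint, callers must be compiled with the same integer width as the library. A usage sketch (illustrative, not part of the patch; the local typedef mimics a USE64BITINT build and must match how the library was actually configured):

    /* Usage sketch: factor a small SPD matrix through the blasint interface. */
    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t blasint;   /* assumption: ILP64 (USE64BITINT) build */

    extern void RELAPACK_dpotrf(const char *uplo, const blasint *n,
                                double *A, const blasint *ldA, blasint *info);

    int main(void) {
        double A[9] = { 4, 2, 2,  2, 5, 1,  2, 1, 6 };  /* column-major, SPD */
        blasint n = 3, ldA = 3, info = 0;
        RELAPACK_dpotrf("L", &n, A, &ldA, &info);
        printf("info = %lld, L[0][0] = %g\n", (long long)info, A[0]);
        return 0;
    }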
diff --git a/relapack/src/dsygst.c b/relapack/src/dsygst.c
index 0228068ce..f68241e3a 100644
--- a/relapack/src/dsygst.c
+++ b/relapack/src/dsygst.c
@@ -3,9 +3,9 @@
 #include "stdlib.h"
 #endif
 
-static void RELAPACK_dsygst_rec(const int *, const char *, const int *,
-    double *, const int *, const double *, const int *,
-    double *, const int *, int *);
+static void RELAPACK_dsygst_rec(const blasint *, const char *, const blasint *,
+    double *, const blasint *, const double *, const blasint *,
+    double *, const blasint *, blasint *);
 
 /** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
 
@@ -15,14 +15,14 @@ static void RELAPACK_dsygst_rec(const int *, const char *, const int *,
  * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html
  * */
 void RELAPACK_dsygst(
-    const int *itype, const char *uplo, const int *n,
-    double *A, const int *ldA, const double *B, const int *ldB,
-    int *info
+    const blasint *itype, const char *uplo, const blasint *n,
+    double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    blasint *info
 ) {
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
     *info = 0;
     if (*itype < 1 || *itype > 3)
         *info = -1;
@@ -35,8 +35,8 @@ void RELAPACK_dsygst(
     else if (*ldB < MAX(1, *n))
         *info = -7;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DSYGST", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DSYGST", &minfo, strlen("DSYGST"));
         return;
     }
 
@@ -45,10 +45,10 @@ void RELAPACK_dsygst(
 
     // Allocate work space
     double *Work = NULL;
-    int lWork = 0;
+    blasint lWork = 0;
 #if XSYGST_ALLOW_MALLOC
-    const int n1 = DREC_SPLIT(*n);
-    lWork = n1 * (*n - n1);
+    const blasint n1 = DREC_SPLIT(*n);
+    lWork = abs( n1 * (*n - n1) );
     Work  = malloc(lWork * sizeof(double));
     if (!Work)
         lWork = 0;
@@ -67,9 +67,9 @@ void RELAPACK_dsygst(
 
 /** dsygst's recursive compute kernel */
 static void RELAPACK_dsygst_rec(
-    const int *itype, const char *uplo, const int *n,
-    double *A, const int *ldA, const double *B, const int *ldB,
-    double *Work, const int *lWork, int *info
+    const blasint *itype, const char *uplo, const blasint *n,
+    double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    double *Work, const blasint *lWork, blasint *info
 ) {
 
     if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
@@ -84,14 +84,14 @@ static void RELAPACK_dsygst_rec(
     const double MONE[] = { -1. };
     const double HALF[] = { .5 };
     const double MHALF[] = { -.5 };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
     // Loop iterator
-    int i;
+    blasint i;
 
     // Splitting
-    const int n1 = DREC_SPLIT(*n);
-    const int n2 = *n - n1;
+    const blasint n1 = DREC_SPLIT(*n);
+    const blasint n2 = *n - n1;
 
     // A_TL A_TR
     // A_BL A_BR
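The new abs() around the workspace size keeps an overflowed 32-bit product from reaching malloc as a negative value; a 64-bit blasint avoids the overflow altogether, since n1 * (*n - n1) exceeds the 32-bit range once n is around 93,000. The following illustration (not part of the patch) shows the scale of the problem without invoking any overflow:

    /* Illustrative arithmetic: why workspace sizes need the wide type. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int64_t n = 100000, n1 = n / 2;
        int64_t lWork = n1 * (n - n1);            /* 2500000000 */
        printf("lWork          = %lld\n", (long long)lWork);
        printf("fits in int32? = %s\n", lWork <= INT32_MAX ? "yes" : "no");
        return 0;
    }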
diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c
index 80b119336..43d28f94e 100644
--- a/relapack/src/dsytrf.c
+++ b/relapack/src/dsytrf.c
@@ -3,8 +3,8 @@
 #include <stdlib.h>
 #endif
 
-static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
-    double *, const int *, int *, double *, const int *, int *);
+static void RELAPACK_dsytrf_rec(const char *, const blasint *, const blasint *, blasint *,
+    double *, const blasint *, blasint *, double *, const blasint *, blasint *);
 
 /** DSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
 
@@ -14,21 +14,21 @@ static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
  * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html
  * */
 void RELAPACK_dsytrf(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
+    const char *uplo, const blasint *n,
+    double *A, const blasint *ldA, blasint *ipiv,
+    double *Work, const blasint *lWork, blasint *info
 ) {
 
     // Required work size
-    const int cleanlWork = *n * (*n / 2);
-    int minlWork = cleanlWork;
+    const blasint cleanlWork = *n * (*n / 2);
+    blasint minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
     minlWork = 1;
 #endif
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
     *info = 0;
     if (!lower && !upper)
         *info = -1;
@@ -55,8 +55,8 @@ void RELAPACK_dsytrf(
 #endif
 
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DSYTRF", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF"));
         return;
     }
 
@@ -64,7 +64,7 @@ void RELAPACK_dsytrf(
     const char cleanuplo = lower ? 'L' : 'U';
 
     // Dummy arguments
-    int nout;
+    blasint nout;
 
     // Recursive kernel
     RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
@@ -78,13 +78,13 @@ void RELAPACK_dsytrf(
 
 /** dsytrf's recursive compute kernel */
 static void RELAPACK_dsytrf_rec(
-    const char *uplo, const int *n_full, const int *n, int *n_out,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *ldWork, int *info
+    const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out,
+    double *A, const blasint *ldA, blasint *ipiv,
+    double *Work, const blasint *ldWork, blasint *info
 ) {
 
     // top recursion level?
-    const int top = *n_full == *n;
+    const blasint top = *n_full == *n;
 
     if (*n <= MAX(CROSSOVER_DSYTRF, 3)) {
         // Unblocked
@@ -96,34 +96,34 @@ static void RELAPACK_dsytrf_rec(
         return;
     }
 
-    int info1, info2;
+    blasint info1, info2;
 
     // Constants
     const double ONE[] = { 1. };
     const double MONE[] = { -1. };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
     // Loop iterator
-    int i;
+    blasint i;
 
-    const int n_rest = *n_full - *n;
+    const blasint n_rest = *n_full - *n;
 
     if (*uplo == 'L') {
         // Splitting (setup)
-        int n1 = DREC_SPLIT(*n);
-        int n2 = *n - n1;
+        blasint n1 = DREC_SPLIT(*n);
+        blasint n2 = *n - n1;
 
         // Work_L *
         double *const Work_L = Work;
 
         // recursion(A_L)
-        int n1_out;
+        blasint n1_out;
         RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
         n1 = n1_out;
 
         // Splitting (continued)
         n2 = *n - n1;
-        const int n_full2 = *n_full - n1;
+        const blasint n_full2 = *n_full - n1;
 
         // *    *
         // A_BL A_BR
@@ -139,23 +139,23 @@ static void RELAPACK_dsytrf_rec(
         // (top recursion level: use Work as Work_BR)
         double *const Work_BL = Work + n1;
         double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
-        const int ldWork_BR = top ? n2 : *ldWork;
+        const blasint ldWork_BR = top ? n2 : *ldWork;
 
         // ipiv_T
         // ipiv_B
-        int *const ipiv_B = ipiv + n1;
+        blasint *const ipiv_B = ipiv + n1;
 
         // A_BR = A_BR - A_BL Work_BL'
         RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
         BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
 
         // recursion(A_BR)
-        int n2_out;
+        blasint n2_out;
         RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
 
         if (n2_out != n2) {
             // undo 1 column of updates
-            const int n_restp1 = n_rest + 1;
+            const blasint n_restp1 = n_rest + 1;
 
             // last column of A_BR
             double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
@@ -182,22 +182,22 @@ static void RELAPACK_dsytrf_rec(
         *n_out = n1 + n2;
     } else {
         // Splitting (setup)
-        int n2 = DREC_SPLIT(*n);
-        int n1 = *n - n2;
+        blasint n2 = DREC_SPLIT(*n);
+        blasint n1 = *n - n2;
 
         // * Work_R
         // (top recursion level: use Work as Work_R)
         double *const Work_R = top ? Work : Work + *ldWork * n1;
 
         // recursion(A_R)
-        int n2_out;
+        blasint n2_out;
         RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
-        const int n2_diff = n2 - n2_out;
+        const blasint n2_diff = n2 - n2_out;
         n2 = n2_out;
 
         // Splitting (continued)
         n1 = *n - n2;
-        const int n_full1 = *n_full - n2;
+        const blasint n_full1 = *n_full - n2;
 
         // * A_TL_T A_TR_T
         // * A_TL   A_TR
@@ -211,19 +211,19 @@ static void RELAPACK_dsytrf_rec(
         // (top recursion level: Work_R was Work)
         double *const Work_L = Work;
         double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
-        const int ldWork_L = top ? n1 : *ldWork;
+        const blasint ldWork_L = top ? n1 : *ldWork;
 
         // A_TL = A_TL - A_TR Work_TR'
         RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
         BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
 
         // recursion(A_TL)
-        int n1_out;
+        blasint n1_out;
         RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
 
         if (n1_out != n1) {
             // undo 1 column of updates
-            const int n_restp1 = n_rest + 1;
+            const blasint n_restp1 = n_rest + 1;
 
             // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
             BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
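One detail of the kernel above is easy to miss: the recursion reports back n_out, the number of columns it actually factored, and the caller re-derives the trailing problem from it. A toy model of that protocol (illustrative only; in the real code the trigger is a 2x2 Bunch-Kaufman pivot that would straddle the split):

    /* Toy model of the n/n_out protocol used by RELAPACK_dsytrf_rec. */
    #include <stdio.h>

    static void factor_panel(int n_requested, int *n_out) {
        /* pretend a 2x2 pivot straddles the boundary: deliver one less */
        *n_out = n_requested - 1;
    }

    int main(void) {
        int n = 8, n1 = n / 2, n1_out;
        factor_panel(n1, &n1_out);
        n1 = n1_out;               /* accept the realized panel width */
        int n2 = n - n1;           /* trailing block grows by one     */
        printf("n1 = %d, n2 = %d\n", n1, n2);
        return 0;
    }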
diff --git a/relapack/src/dsytrf_rec2.c b/relapack/src/dsytrf_rec2.c
index 72ef827b1..6ed1a47a2 100644
--- a/relapack/src/dsytrf_rec2.c
+++ b/relapack/src/dsytrf_rec2.c
@@ -14,7 +14,7 @@
 
 /* Table of constant values */
 
-static int c__1 = 1;
+static blasint c__1 = 1;
 static double c_b8 = -1.;
 static double c_b9 = 1.;
 
@@ -25,33 +25,33 @@ static double c_b9 = 1.;
  * The blocked BLAS Level 3 updates were removed and moved to the
  * recursive algorithm.
  * */
-/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int *
-        nb, int *kb, double *a, int *lda, int *ipiv,
-        double *w, int *ldw, int *info, ftnlen uplo_len)
+/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, blasint *n, blasint *
+        nb, blasint *kb, double *a, blasint *lda, blasint *ipiv,
+        double *w, blasint *ldw, blasint *info, ftnlen uplo_len)
 {
     /* System generated locals */
-    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
+    blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
     double d__1, d__2, d__3;
 
     /* Builtin functions */
     double sqrt(double);
 
     /* Local variables */
-    static int j, k;
+    static blasint j, k;
     static double t, r1, d11, d21, d22;
-    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static blasint jj, kk, jp, kp, kw, kkw, imax, jmax;
     static double alpha;
-    extern /* Subroutine */ int dscal_(int *, double *, double *,
-            int *);
+    extern /* Subroutine */ blasint dscal_(blasint *, double *, double *,
+            blasint *);
     extern logical lsame_(char *, char *, ftnlen, ftnlen);
-    extern /* Subroutine */ int dgemv_(char *, int *, int *,
-            double *, double *, int *, double *, int *,
-            double *, double *, int *, ftnlen), dcopy_(int *,
-            double *, int *, double *, int *), dswap_(int
-            *, double *, int *, double *, int *);
-    static int kstep;
+    extern /* Subroutine */ blasint dgemv_(char *, blasint *, blasint *,
+            double *, double *, blasint *, double *, blasint *,
+            double *, double *, blasint *, ftnlen), dcopy_(blasint *,
+            double *, blasint *, double *, blasint *), dswap_(blasint
+            *, double *, blasint *, double *, blasint *);
+    static blasint kstep;
     static double absakk;
-    extern int idamax_(int *, double *, int *);
+    extern blasint idamax_(blasint *, double *, blasint *);
     static double colmax, rowmax;
 
     /* Parameter adjustments */
diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c
index 19a875c7a..78fa652ab 100644
--- a/relapack/src/dsytrf_rook.c
+++ b/relapack/src/dsytrf_rook.c
@@ -3,8 +3,8 @@
 #include <stdlib.h>
 #endif
 
-static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *,
-    double *, const int *, int *, double *, const int *, int *);
+static void RELAPACK_dsytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *,
+    double *, const blasint *, blasint *, double *, const blasint *, blasint *);
 
 /** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
 
@@ -14,21 +14,21 @@ static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int
  * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html
  * */
 void RELAPACK_dsytrf_rook(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
+    const char *uplo, const blasint *n,
+    double *A, const blasint *ldA, blasint *ipiv,
+    double *Work, const blasint *lWork, blasint *info
 ) {
 
     // Required work size
-    const int cleanlWork = *n * (*n / 2);
-    int minlWork = cleanlWork;
+    const blasint cleanlWork = *n * (*n / 2);
+    blasint minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
     minlWork = 1;
 #endif
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
     *info = 0;
     if (!lower && !upper)
         *info = -1;
@@ -55,8 +55,8 @@ void RELAPACK_dsytrf_rook(
 #endif
 
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DSYTRF", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF"));
         return;
     }
 
@@ -64,7 +64,7 @@ void RELAPACK_dsytrf_rook(
     const char cleanuplo = lower ? 'L' : 'U';
 
     // Dummy argument
-    int nout;
+    blasint nout;
 
     // Recursive kernel
     RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
@@ -78,13 +78,13 @@ void RELAPACK_dsytrf_rook(
 
 /** dsytrf_rook's recursive compute kernel */
 static void RELAPACK_dsytrf_rook_rec(
-    const char *uplo, const int *n_full, const int *n, int *n_out,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *ldWork, int *info
+    const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out,
+    double *A, const blasint *ldA, blasint *ipiv,
+    double *Work, const blasint *ldWork, blasint *info
 ) {
 
     // top recursion level?
-    const int top = *n_full == *n;
+    const blasint top = *n_full == *n;
 
     if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) {
         // Unblocked
@@ -96,31 +96,31 @@ static void RELAPACK_dsytrf_rook_rec(
         return;
     }
 
-    int info1, info2;
+    blasint info1, info2;
 
     // Constants
     const double ONE[] = { 1. };
     const double MONE[] = { -1. };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
-    const int n_rest = *n_full - *n;
+    const blasint n_rest = *n_full - *n;
 
     if (*uplo == 'L') {
         // Splitting (setup)
-        int n1 = DREC_SPLIT(*n);
-        int n2 = *n - n1;
+        blasint n1 = DREC_SPLIT(*n);
+        blasint n2 = *n - n1;
 
         // Work_L *
         double *const Work_L = Work;
 
         // recursion(A_L)
-        int n1_out;
+        blasint n1_out;
         RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
         n1 = n1_out;
 
         // Splitting (continued)
         n2 = *n - n1;
-        const int n_full2 = *n_full - n1;
+        const blasint n_full2 = *n_full - n1;
 
         // *    *
         // A_BL A_BR
@@ -136,23 +136,23 @@ static void RELAPACK_dsytrf_rook_rec(
         // (top recursion level: use Work as Work_BR)
         double *const Work_BL = Work + n1;
         double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
-        const int ldWork_BR = top ? n2 : *ldWork;
+        const blasint ldWork_BR = top ? n2 : *ldWork;
 
         // ipiv_T
         // ipiv_B
-        int *const ipiv_B = ipiv + n1;
+        blasint *const ipiv_B = ipiv + n1;
 
         // A_BR = A_BR - A_BL Work_BL'
         RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
         BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
 
         // recursion(A_BR)
-        int n2_out;
+        blasint n2_out;
         RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
 
         if (n2_out != n2) {
             // undo 1 column of updates
-            const int n_restp1 = n_rest + 1;
+            const blasint n_restp1 = n_rest + 1;
 
             // last column of A_BR
             double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
@@ -169,7 +169,7 @@ static void RELAPACK_dsytrf_rook_rec(
         n2 = n2_out;
 
         // shift pivots
-        int i;
+        blasint i;
         for (i = 0; i < n2; i++)
             if (ipiv_B[i] > 0)
                 ipiv_B[i] += n1;
@@ -180,22 +180,22 @@ static void RELAPACK_dsytrf_rook_rec(
         *n_out = n1 + n2;
     } else {
         // Splitting (setup)
-        int n2 = DREC_SPLIT(*n);
-        int n1 = *n - n2;
+        blasint n2 = DREC_SPLIT(*n);
+        blasint n1 = *n - n2;
 
         // * Work_R
         // (top recursion level: use Work as Work_R)
         double *const Work_R = top ? Work : Work + *ldWork * n1;
 
         // recursion(A_R)
-        int n2_out;
+        blasint n2_out;
         RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
-        const int n2_diff = n2 - n2_out;
+        const blasint n2_diff = n2 - n2_out;
         n2 = n2_out;
 
         // Splitting (continued)
         n1 = *n - n2;
-        const int n_full1 = *n_full - n2;
+        const blasint n_full1 = *n_full - n2;
 
         // * A_TL_T A_TR_T
         // * A_TL   A_TR
@@ -211,19 +211,19 @@ static void RELAPACK_dsytrf_rook_rec(
         // (top recursion level: Work_R was Work)
         double *const Work_L = Work;
         double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
-        const int ldWork_L = top ? n1 : *ldWork;
+        const blasint ldWork_L = top ? n1 : *ldWork;
 
         // A_TL = A_TL - A_TR Work_TR'
         RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
         BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
 
         // recursion(A_TL)
-        int n1_out;
+        blasint n1_out;
         RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
 
         if (n1_out != n1) {
             // undo 1 column of updates
-            const int n_restp1 = n_rest + 1;
+            const blasint n_restp1 = n_rest + 1;
 
             // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
             BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
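The pivot bookkeeping after the trailing recursion deserves a note: LAPACK pivot indices are 1-based and local to the submatrix the recursion saw, and in the rook/Bunch-Kaufman encoding a negative entry marks a column of a 2x2 pivot. Re-basing them therefore has to preserve the sign, as in this stand-alone sketch (illustrative; long long stands in for a 64-bit blasint):

    /* Re-basing LAPACK-style pivot indices after a recursion on a
     * trailing submatrix, preserving the 2x2-pivot sign encoding. */
    static void shift_pivots(long long *ipiv, long long n, long long offset) {
        for (long long i = 0; i < n; i++) {
            if (ipiv[i] > 0)
                ipiv[i] += offset;   /* 1x1 pivot: plain row index      */
            else
                ipiv[i] -= offset;   /* 2x2 pivot: sign marks the block */
        }
    }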
diff --git a/relapack/src/dsytrf_rook_rec2.c b/relapack/src/dsytrf_rook_rec2.c
index 105ef5ed3..bdb5c6e29 100644
--- a/relapack/src/dsytrf_rook_rec2.c
+++ b/relapack/src/dsytrf_rook_rec2.c
@@ -14,7 +14,7 @@
 
 /* Table of constant values */
 
-static int c__1 = 1;
+static blasint c__1 = 1;
 static double c_b9 = -1.;
 static double c_b10 = 1.;
 
@@ -25,39 +25,39 @@ static double c_b10 = 1.;
  * The blocked BLAS Level 3 updates were removed and moved to the
  * recursive algorithm.
  * */
-/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n,
-        int *nb, int *kb, double *a, int *lda, int *ipiv,
-        double *w, int *ldw, int *info, ftnlen uplo_len)
+/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, blasint *n,
+        blasint *nb, blasint *kb, double *a, blasint *lda, blasint *ipiv,
+        double *w, blasint *ldw, blasint *info, ftnlen uplo_len)
 {
     /* System generated locals */
-    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
+    blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
     double d__1;
 
     /* Builtin functions */
     double sqrt(double);
 
     /* Local variables */
-    static int j, k, p;
+    static blasint j, k, p;
     static double t, r1, d11, d12, d21, d22;
-    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw;
     static logical done;
-    static int imax, jmax;
+    static blasint imax, jmax;
     static double alpha;
-    extern /* Subroutine */ int dscal_(int *, double *, double *,
-            int *);
+    extern /* Subroutine */ blasint dscal_(blasint *, double *, double *,
+            blasint *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
-    extern /* Subroutine */ int dgemv_(char *, int *, int *,
-            double *, double *, int *, double *, int *,
-            double *, double *, int *, ftnlen);
+    extern /* Subroutine */ blasint dgemv_(char *, blasint *, blasint *,
+            double *, double *, blasint *, double *, blasint *,
+            double *, double *, blasint *, ftnlen);
     static double dtemp, sfmin;
-    static int itemp;
-    extern /* Subroutine */ int dcopy_(int *, double *, int *,
-            double *, int *), dswap_(int *, double *, int
-            *, double *, int *);
-    static int kstep;
+    static blasint itemp;
+    extern /* Subroutine */ blasint dcopy_(blasint *, double *, blasint *,
+            double *, blasint *), dswap_(blasint *, double *, blasint
+            *, double *, blasint *);
+    static blasint kstep;
     extern double dlamch_(char *, ftnlen);
     static double absakk;
-    extern int idamax_(int *, double *, int *);
+    extern blasint idamax_(blasint *, double *, blasint *);
     static double colmax, rowmax;
 
     /* Parameter adjustments */
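The *_rec2.c files in this patch are f2c translations of Fortran, which explains their conventions: every argument is passed by pointer, and a SUBROUTINE becomes an int-returning C function whose result carries no meaning — which is why the nominal return types here can be switched without touching any caller. A minimal sketch of the convention (illustrative):

    /* f2c convention sketch:  SUBROUTINE DEMO(N, X)  becomes: */
    int demo_(int *n, double *x) {
        for (int i = 0; i < *n; i++)
            x[i] *= 2.;
        return 0;   /* f2c convention: the return value is never used */
    }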
diff --git a/relapack/src/dtgsyl.c b/relapack/src/dtgsyl.c
index c506926af..9bbc987e7 100644
--- a/relapack/src/dtgsyl.c
+++ b/relapack/src/dtgsyl.c
@@ -1,11 +1,11 @@
 #include "relapack.h"
 #include <math.h>
 
-static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
-    const int *, const double *, const int *, const double *, const int *,
-    double *, const int *, const double *, const int *, const double *,
-    const int *, double *, const int *, double *, double *, double *, int *,
-    int *, int *);
+static void RELAPACK_dtgsyl_rec(const char *, const blasint *, const blasint *,
+    const blasint *, const double *, const blasint *, const double *, const blasint *,
+    double *, const blasint *, const double *, const blasint *, const double *,
+    const blasint *, double *, const blasint *, double *, double *, double *, blasint *,
+    blasint *, blasint *);
 
 /** DTGSYL solves the generalized Sylvester equation.
 
@@ -15,21 +15,21 @@ static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
  * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html
  * */
 void RELAPACK_dtgsyl(
-    const char *trans, const int *ijob, const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC,
-    const double *D, const int *ldD, const double *E, const int *ldE,
-    double *F, const int *ldF,
+    const char *trans, const blasint *ijob, const blasint *m, const blasint *n,
+    const double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    double *C, const blasint *ldC,
+    const double *D, const blasint *ldD, const double *E, const blasint *ldE,
+    double *F, const blasint *ldF,
     double *scale, double *dif,
-    double *Work, const int *lWork, int *iWork, int *info
+    double *Work, const blasint *lWork, blasint *iWork, blasint *info
 ) {
 
     // Parse arguments
-    const int notran = LAPACK(lsame)(trans, "N");
-    const int tran = LAPACK(lsame)(trans, "T");
+    const blasint notran = LAPACK(lsame)(trans, "N");
+    const blasint tran = LAPACK(lsame)(trans, "T");
 
     // Compute work buffer size
-    int lwmin = 1;
+    blasint lwmin = 1;
     if (notran && (*ijob == 1 || *ijob == 2))
         lwmin = MAX(1, 2 * *m * *n);
     *info = 0;
@@ -58,8 +58,8 @@ void RELAPACK_dtgsyl(
     else if (*lWork < lwmin && *lWork != -1)
         *info = -20;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DTGSYL", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DTGSYL", &minfo, strlen("DTGSYL"));
         return;
     }
 
@@ -75,8 +75,8 @@ void RELAPACK_dtgsyl(
     // Constant
     const double ZERO[] = { 0. };
 
-    int isolve = 1;
-    int ifunc = 0;
+    blasint isolve = 1;
+    blasint ifunc = 0;
     if (notran) {
         if (*ijob >= 3) {
             ifunc = *ijob - 2;
@@ -87,12 +87,12 @@ void RELAPACK_dtgsyl(
     }
 
     double scale2;
-    int iround;
+    blasint iround;
     for (iround = 1; iround <= isolve; iround++) {
         *scale = 1;
         double dscale = 0;
         double dsum = 1;
-        int pq;
+        blasint pq;
         RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
         if (dscale != 0) {
             if (*ijob == 1 || *ijob == 3)
@@ -121,13 +121,13 @@ void RELAPACK_dtgsyl(
 
 /** dtgsyl's recursive compute kernel */
 static void RELAPACK_dtgsyl_rec(
-    const char *trans, const int *ifunc, const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC,
-    const double *D, const int *ldD, const double *E, const int *ldE,
-    double *F, const int *ldF,
+    const char *trans, const blasint *ifunc, const blasint *m, const blasint *n,
+    const double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    double *C, const blasint *ldC,
+    const double *D, const blasint *ldD, const double *E, const blasint *ldE,
+    double *F, const blasint *ldF,
     double *scale, double *dsum, double *dscale,
-    int *iWork, int *pq, int *info
+    blasint *iWork, blasint *pq, blasint *info
 ) {
 
     if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) {
@@ -139,20 +139,20 @@ static void RELAPACK_dtgsyl_rec(
     // Constants
     const double ONE[] = { 1. };
     const double MONE[] = { -1. };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
     // Outputs
     double scale1[] = { 1. };
     double scale2[] = { 1. };
-    int info1[] = { 0 };
-    int info2[] = { 0 };
+    blasint info1[] = { 0 };
+    blasint info2[] = { 0 };
 
     if (*m > *n) {
         // Splitting
-        int m1 = DREC_SPLIT(*m);
+        blasint m1 = DREC_SPLIT(*m);
         if (A[m1 + *ldA * (m1 - 1)])
             m1++;
-        const int m2 = *m - m1;
+        const blasint m2 = *m - m1;
 
         // A_TL A_TR
         // 0    A_BR
@@ -210,10 +210,10 @@ static void RELAPACK_dtgsyl_rec(
         }
     } else {
         // Splitting
-        int n1 = DREC_SPLIT(*n);
+        blasint n1 = DREC_SPLIT(*n);
         if (B[n1 + *ldB * (n1 - 1)])
             n1++;
-        const int n2 = *n - n1;
+        const blasint n2 = *n - n1;
 
         // B_TL B_TR
         // 0    B_BR
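The `if (A[m1 + *ldA * (m1 - 1)]) m1++;` guard above is worth spelling out: A is quasi-triangular (real Schur form), so its diagonal may carry 2x2 blocks for complex-conjugate eigenvalue pairs, and the split must not separate the two rows of such a block. A stand-alone sketch (illustrative; long long stands in for blasint):

    /* Nudge a split index off a 2x2 diagonal block of a quasi-triangular
     * matrix A (column-major, leading dimension ldA, 0-based here). */
    static long long safe_split(const double *A, long long ldA, long long m1) {
        if (A[m1 + ldA * (m1 - 1)] != 0.)   /* subdiagonal of a 2x2 block */
            m1++;
        return m1;
    }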
diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c
index c87b53ae5..766377300 100644
--- a/relapack/src/dtrsyl.c
+++ b/relapack/src/dtrsyl.c
@@ -1,8 +1,8 @@
 #include "relapack.h"
 
-static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *,
-    const int *, const int *, const double *, const int *, const double *,
-    const int *, double *, const int *, double *, int *);
+static void RELAPACK_dtrsyl_rec(const char *, const char *, const blasint *,
+    const blasint *, const blasint *, const double *, const blasint *, const double *,
+    const blasint *, double *, const blasint *, double *, blasint *);
 
 /** DTRSYL solves the real Sylvester matrix equation.
 
@@ -12,20 +12,20 @@ static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *,
  * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html
  * */
 void RELAPACK_dtrsyl(
-    const char *tranA, const char *tranB, const int *isgn,
-    const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC, double *scale,
-    int *info
+    const char *tranA, const char *tranB, const blasint *isgn,
+    const blasint *m, const blasint *n,
+    const double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    double *C, const blasint *ldC, double *scale,
+    blasint *info
 ) {
 
     // Check arguments
-    const int notransA = LAPACK(lsame)(tranA, "N");
-    const int transA = LAPACK(lsame)(tranA, "T");
-    const int ctransA = LAPACK(lsame)(tranA, "C");
-    const int notransB = LAPACK(lsame)(tranB, "N");
-    const int transB = LAPACK(lsame)(tranB, "T");
-    const int ctransB = LAPACK(lsame)(tranB, "C");
+    const blasint notransA = LAPACK(lsame)(tranA, "N");
+    const blasint transA = LAPACK(lsame)(tranA, "T");
+    const blasint ctransA = LAPACK(lsame)(tranA, "C");
+    const blasint notransB = LAPACK(lsame)(tranB, "N");
+    const blasint transB = LAPACK(lsame)(tranB, "T");
+    const blasint ctransB = LAPACK(lsame)(tranB, "C");
     *info = 0;
     if (!transA && !ctransA && !notransA)
         *info = -1;
@@ -44,8 +44,8 @@ void RELAPACK_dtrsyl(
     else if (*ldC < MAX(1, *m))
         *info = -11;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DTRSYL", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DTRSYL", &minfo, strlen("DTRSYL"));
         return;
     }
 
@@ -60,11 +60,11 @@ void RELAPACK_dtrsyl(
 
 /** dtrsyl's recursive compute kernel */
 static void RELAPACK_dtrsyl_rec(
-    const char *tranA, const char *tranB, const int *isgn,
-    const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC, double *scale,
-    int *info
+    const char *tranA, const char *tranB, const blasint *isgn,
+    const blasint *m, const blasint *n,
+    const double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    double *C, const blasint *ldC, double *scale,
+    blasint *info
 ) {
 
     if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) {
@@ -77,20 +77,20 @@ static void RELAPACK_dtrsyl_rec(
     const double ONE[] = { 1. };
     const double MONE[] = { -1. };
     const double MSGN[] = { -*isgn };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
     // Outputs
     double scale1[] = { 1. };
     double scale2[] = { 1. };
-    int info1[] = { 0 };
-    int info2[] = { 0 };
+    blasint info1[] = { 0 };
+    blasint info2[] = { 0 };
 
     if (*m > *n) {
         // Splitting
-        int m1 = DREC_SPLIT(*m);
+        blasint m1 = DREC_SPLIT(*m);
         if (A[m1 + *ldA * (m1 - 1)])
             m1++;
-        const int m2 = *m - m1;
+        const blasint m2 = *m - m1;
 
         // A_TL A_TR
         // 0    A_BR
@@ -126,10 +126,10 @@ static void RELAPACK_dtrsyl_rec(
         }
     } else {
         // Splitting
-        int n1 = DREC_SPLIT(*n);
+        blasint n1 = DREC_SPLIT(*n);
         if (B[n1 + *ldB * (n1 - 1)])
             n1++;
-        const int n2 = *n - n1;
+        const blasint n2 = *n - n1;
 
         // B_TL B_TR
         // 0    B_BR
diff --git a/relapack/src/dtrsyl_rec2.c b/relapack/src/dtrsyl_rec2.c
index 479c7f340..50dabf76d 100644
--- a/relapack/src/dtrsyl_rec2.c
+++ b/relapack/src/dtrsyl_rec2.c
@@ -14,52 +14,52 @@
 
 /* Table of constant values */
 
-static int c__1 = 1;
-static int c_false = FALSE_;
-static int c__2 = 2;
+static blasint c__1 = 1;
+static blasint c_false = FALSE_;
+static blasint c__2 = 2;
 static double c_b26 = 1.;
 static double c_b30 = 0.;
-static int c_true = TRUE_;
+static blasint c_true = TRUE_;
 
-int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, int *isgn, int
-        *m, int *n, double *a, int *lda, double *b, int *
-        ldb, double *c__, int *ldc, double *scale, int *info,
-        ftnlen trana_len, ftnlen tranb_len)
+int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, blasint *isgn, blasint
+        *m, blasint *n, double *a, blasint *lda, double *b, blasint *
+        ldb, double *c__, blasint *ldc, double *scale, blasint *info,
+        ftnlen trana_len, ftnlen tranb_len)
 {
     /* System generated locals */
-    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+    blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
            i__3, i__4;
     double d__1, d__2;
 
     /* Local variables */
-    static int j, k, l;
+    static blasint j, k, l;
     static double x[4]  /* was [2][2] */;
-    static int k1, k2, l1, l2;
+    static blasint k1, k2, l1, l2;
     static double a11, db, da11, vec[4]  /* was [2][2] */, dum[1], eps, sgn;
-    extern double ddot_(int *, double *, int *, double *,
-            int *);
-    static int ierr;
+    extern double ddot_(blasint *, double *, blasint *, double *,
+            blasint *);
+    static blasint ierr;
     static double smin, suml, sumr;
-    extern /* Subroutine */ int dscal_(int *, double *, double *,
-            int *);
-    extern int lsame_(char *, char *, ftnlen, ftnlen);
-    static int knext, lnext;
+    extern /* Subroutine */ blasint dscal_(blasint *, double *, double *,
+            blasint *);
+    extern blasint lsame_(char *, char *, ftnlen, ftnlen);
+    static blasint knext, lnext;
     static double xnorm;
-    extern /* Subroutine */ int dlaln2_(int *, int *, int *,
-            double *, double *, double *, int *, double *,
-            double *, double *, int *, double *, double *
-            , double *, int *, double *, double *, int *),
-            dlasy2_(int *, int *, int *, int *, int *,
-            double *, int *, double *, int *, double *,
-            int *, double *, double *, int *, double *,
-            int *), dlabad_(double *, double *);
-    extern double dlamch_(char *, ftnlen), dlange_(char *, int *,
-            int *, double *, int *, double *, ftnlen);
+    extern /* Subroutine */ blasint dlaln2_(blasint *, blasint *, blasint *,
+            double *, double *, double *, blasint *, double *,
+            double *, double *, blasint *, double *, double *
+            , double *, blasint *, double *, double *, blasint *),
+            dlasy2_(blasint *, blasint *, blasint *, blasint *, blasint *,
+            double *, blasint *, double *, blasint *, double *,
+            blasint *, double *, double *, blasint *, double *,
+            blasint *), dlabad_(double *, double *);
+    extern double dlamch_(char *, ftnlen), dlange_(char *, blasint *,
+            blasint *, double *, blasint *, double *, ftnlen);
     static double scaloc;
-    extern /* Subroutine */ int xerbla_(char *, int *, ftnlen);
+    extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen);
     static double bignum;
-    static int notrna, notrnb;
+    static blasint notrna, notrnb;
    static double smlnum;
 
     /* Parameter adjustments */
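DTRSYL solves op(A)*X + isgn*X*op(B) = scale*C with isgn = +1 or -1, and the recursion feeds the negated sign into DGEMM as its scalar alpha, which is why the integer parameter is materialized as the one-element double array MSGN. In isolation (illustrative only):

    #include <stdio.h>

    int main(void) {
        const int isgn = -1;              /* +1 or -1 from the caller      */
        const double MSGN[] = { -isgn };  /* double alpha for BLAS(dgemm)  */
        printf("alpha = %g\n", MSGN[0]);
        return 0;
    }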
diff --git a/relapack/src/dtrtri.c b/relapack/src/dtrtri.c
index 0462609e9..72777e7e4 100644
--- a/relapack/src/dtrtri.c
+++ b/relapack/src/dtrtri.c
@@ -1,7 +1,7 @@
 #include "relapack.h"
 
-static void RELAPACK_dtrtri_rec(const char *, const char *, const int *,
-    double *, const int *, int *);
+static void RELAPACK_dtrtri_rec(const char *, const char *, const blasint *,
+    double *, const blasint *, blasint *);
 
 /** DTRTRI computes the inverse of a real upper or lower triangular matrix A.
 
@@ -11,16 +11,16 @@ static void RELAPACK_dtrtri_rec(const char *, const char *, const int *,
  * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html
  * */
 void RELAPACK_dtrtri(
-    const char *uplo, const char *diag, const int *n,
-    double *A, const int *ldA,
-    int *info
+    const char *uplo, const char *diag, const blasint *n,
+    double *A, const blasint *ldA,
+    blasint *info
 ) {
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
-    const int nounit = LAPACK(lsame)(diag, "N");
-    const int unit = LAPACK(lsame)(diag, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
+    const blasint nounit = LAPACK(lsame)(diag, "N");
+    const blasint unit = LAPACK(lsame)(diag, "U");
     *info = 0;
     if (!lower && !upper)
         *info = -1;
@@ -31,8 +31,8 @@ void RELAPACK_dtrtri(
     else if (*ldA < MAX(1, *n))
         *info = -5;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DTRTRI", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DTRTRI", &minfo, strlen("DTRTRI"));
         return;
     }
 
@@ -42,7 +42,7 @@ void RELAPACK_dtrtri(
 
     // check for singularity
     if (nounit) {
-        int i;
+        blasint i;
         for (i = 0; i < *n; i++)
             if (A[i + *ldA * i] == 0) {
                 *info = i;
@@ -57,9 +57,9 @@ void RELAPACK_dtrtri(
 
 /** dtrtri's recursive compute kernel */
 static void RELAPACK_dtrtri_rec(
-    const char *uplo, const char *diag, const int *n,
-    double *A, const int *ldA,
-    int *info
+    const char *uplo, const char *diag, const blasint *n,
+    double *A, const blasint *ldA,
+    blasint *info
 ){
 
     if (*n <= MAX(CROSSOVER_DTRTRI, 1)) {
@@ -73,8 +73,8 @@ static void RELAPACK_dtrtri_rec(
     const double MONE[] = { -1. };
 
     // Splitting
-    const int n1 = DREC_SPLIT(*n);
-    const int n2 = *n - n1;
+    const blasint n1 = DREC_SPLIT(*n);
+    const blasint n2 = *n - n1;
 
     // A_TL A_TR
     // A_BL A_BR
diff --git a/relapack/src/f2c.c b/relapack/src/f2c.c
index 5a3452419..48539c4b9 100644
--- a/relapack/src/f2c.c
+++ b/relapack/src/f2c.c
@@ -9,7 +9,7 @@
 #endif
 #endif
 
-void sig_die(const char *s, int kill) {
+void sig_die(const char *s, blasint kill) {
     /* print error message, then clear buffers */
 
     fprintf(stderr, "%s\n", s);
diff --git a/relapack/src/f2c.h b/relapack/src/f2c.h
index b94ee7c8e..85337becf 100644
--- a/relapack/src/f2c.h
+++ b/relapack/src/f2c.h
@@ -7,6 +7,19 @@
 #ifndef F2C_INCLUDE
 #define F2C_INCLUDE
 
+#ifdef USE64BITINT
+typedef BLASLONG blasint;
+#if defined(OS_WINDOWS) && defined(__64BIT__)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+
 typedef long int integer;
 typedef unsigned long int uinteger;
 typedef char *address;
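The f2c.h hunk above is what ties every blasint in this patch to the build's integer width: BLASLONG (64-bit) under USE64BITINT, plain int otherwise, with blasabs() selecting the matching absolute-value function. A quick self-check of which width a given set of flags selects (illustrative; only the macro name is reused from the hunk):

    /* Print the integer width selected by the configuration. */
    #include <stdio.h>

    #ifdef USE64BITINT
    typedef long long blasint_demo;   /* stands in for BLASLONG */
    #else
    typedef int blasint_demo;
    #endif

    int main(void) {
        printf("sizeof(blasint) = %zu bytes\n", sizeof(blasint_demo));
        return 0;
    }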
diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h
index 064276b7e..776b0589f 100644
--- a/relapack/src/lapack.h
+++ b/relapack/src/lapack.h
@@ -1,80 +1,80 @@
 #ifndef LAPACK_H
 #define LAPACK_H
 
-extern int LAPACK(lsame)(const char *, const char *);
-extern int LAPACK(xerbla)(const char *, const int *);
+extern blasint LAPACK(lsame)(const char *, const char *);
+extern blasint LAPACK(xerbla)(const char *, const blasint *, int);
 
-extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
-extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
-extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
-extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
+extern void LAPACK(slaswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *);
+extern void LAPACK(dlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *);
+extern void LAPACK(claswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *);
+extern void LAPACK(zlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *);
 
-extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
-extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
-extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
-extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
+extern void LAPACK(slaset)(const char *, const blasint *, const blasint *, const float *, const float *, float *, const blasint *);
+extern void LAPACK(dlaset)(const char *, const blasint *, const blasint *, const double *, const double *, double *, const blasint *);
+extern void LAPACK(claset)(const char *, const blasint *, const blasint *, const float *, const float *, float *, const blasint *);
+extern void LAPACK(zlaset)(const char *, const blasint *, const blasint *, const double *, const double *, double *, const blasint *);
 
-extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
-extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
-extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
-extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
+extern void LAPACK(slacpy)(const char *, const blasint *, const blasint *, const float *, const blasint *, float *, const blasint *);
+extern void LAPACK(dlacpy)(const char *, const blasint *, const blasint *, const double *, const blasint *, double *, const blasint *);
+extern void LAPACK(clacpy)(const char *, const blasint *, const blasint *, const float *, const blasint *, float *, const blasint *);
+extern void LAPACK(zlacpy)(const char *, const blasint *, const blasint *, const double *, const blasint *, double *, const blasint *);
 
-extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
-extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
-extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
-extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
+extern void LAPACK(slascl)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(dlascl)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const blasint *, double *, const blasint *, blasint *);
+extern void LAPACK(clascl)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(zlascl)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const blasint *, double *, const blasint *, blasint *);
 
-extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *);
-extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *);
-extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *);
-extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *);
+extern void LAPACK(slauu2)(const char *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(dlauu2)(const char *, const blasint *, double *, const blasint *, blasint *);
+extern void LAPACK(clauu2)(const char *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(zlauu2)(const char *, const blasint *, double *, const blasint *, blasint *);
 
-extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
-extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
-extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
-extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+extern void LAPACK(ssygs2)(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *);
+extern void LAPACK(dsygs2)(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *);
+extern void LAPACK(chegs2)(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *);
+extern void LAPACK(zhegs2)(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *);
 
-extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *);
-extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *);
-extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *);
-extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *);
+extern void LAPACK(strti2)(const char *, const char *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(dtrti2)(const char *, const char *, const blasint *, double *, const blasint *, blasint *);
+extern void LAPACK(ctrti2)(const char *, const char *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(ztrti2)(const char *, const char *, const blasint *, double *, const blasint *, blasint *);
 
-extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *);
-extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *);
-extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *);
-extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *);
+extern void LAPACK(spotf2)(const char *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(dpotf2)(const char *, const blasint *, double *, const blasint *, blasint *);
+extern void LAPACK(cpotf2)(const char *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(zpotf2)(const char *, const blasint *, double *, const blasint *, blasint *);
 
-extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *);
-extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
-extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *);
-extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
+extern void LAPACK(spbtf2)(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(dpbtf2)(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *);
+extern void LAPACK(cpbtf2)(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *);
+extern void LAPACK(zpbtf2)(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *);
 
-extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *);
-extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *);
-extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *);
-extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
-extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
-extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(ssytf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(dsytf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *);
+extern void LAPACK(csytf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(chetf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(zsytf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *);
+extern void LAPACK(zhetf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *);
+extern void LAPACK(ssytf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(dsytf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *);
+extern void LAPACK(csytf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(chetf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(zsytf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *);
+extern void LAPACK(zhetf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *);
 
-extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *);
-extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(sgetf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(dgetf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *);
+extern void LAPACK(cgetf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(zgetf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *);
 
-extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
-extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
-extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(sgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(dgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *);
+extern void LAPACK(cgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *);
+extern void LAPACK(zgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *);
 
-extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *);
-extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *);
-extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *);
-extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *);
+extern void LAPACK(stgsy2)(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, blasint *, blasint *, blasint *);
+extern void LAPACK(dtgsy2)(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, blasint *, blasint *, blasint *);
+extern void LAPACK(ctgsy2)(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, blasint *);
+extern void LAPACK(ztgsy2)(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, blasint *);
 
 #endif /* LAPACK_H */
diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c
index 488547260..0252f3d92 100644
--- a/relapack/src/lapack_wrappers.c
+++ b/relapack/src/lapack_wrappers.c
@@ -6,9 +6,9 @@
 
 #if INCLUDE_SLAUUM
 void LAPACK(slauum)(
-    const char *uplo, const int *n,
-    float *A, const int *ldA,
-    int *info
+    const char *uplo, const blasint *n,
+    float *A, const blasint *ldA,
+    blasint *info
 ) {
     RELAPACK_slauum(uplo, n, A, ldA, info);
 }
@@ -16,9 +16,9 @@ void LAPACK(slauum)(
 
 #if INCLUDE_DLAUUM
 void LAPACK(dlauum)(
-    const char *uplo, const int *n,
-    double *A, const int *ldA,
-    int *info
+    const char *uplo, const blasint *n,
+    double *A, const blasint *ldA,
+    blasint *info
 ) {
     RELAPACK_dlauum(uplo, n, A, ldA, info);
 }
@@ -26,9 +26,9 @@ void LAPACK(dlauum)(
 
 #if INCLUDE_CLAUUM
 void LAPACK(clauum)(
-    const char *uplo, const int
*n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_clauum(uplo, n, A, ldA, info); } @@ -36,9 +36,9 @@ void LAPACK(clauum)( #if INCLUDE_ZLAUUM void LAPACK(zlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_zlauum(uplo, n, A, ldA, info); } @@ -51,9 +51,9 @@ void LAPACK(zlauum)( #if INCLUDE_SSYGST void LAPACK(ssygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -61,9 +61,9 @@ void LAPACK(ssygst)( #if INCLUDE_DSYGST void LAPACK(dsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -71,9 +71,9 @@ void LAPACK(dsygst)( #if INCLUDE_CHEGST void LAPACK(chegst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -81,9 +81,9 @@ void LAPACK(chegst)( #if INCLUDE_ZHEGST void LAPACK(zhegst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -96,9 +96,9 @@ void LAPACK(zhegst)( #if INCLUDE_STRTRI void LAPACK(strtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_strtri(uplo, diag, n, A, ldA, info); } @@ -106,9 +106,9 @@ void LAPACK(strtri)( #if INCLUDE_DTRTRI void LAPACK(dtrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); } @@ -116,9 +116,9 @@ void LAPACK(dtrtri)( #if INCLUDE_CTRTRI void LAPACK(ctrtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); } @@ -126,9 +126,9 @@ void LAPACK(ctrtri)( #if INCLUDE_ZTRTRI void LAPACK(ztrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); } @@ -141,9 +141,9 @@ void LAPACK(ztrtri)( #if INCLUDE_SPOTRF void LAPACK(spotrf)( - const char *uplo, const int *n, - float *A, const 
int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_spotrf(uplo, n, A, ldA, info); } @@ -151,9 +151,9 @@ void LAPACK(spotrf)( #if INCLUDE_DPOTRF void LAPACK(dpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_dpotrf(uplo, n, A, ldA, info); } @@ -161,9 +161,9 @@ void LAPACK(dpotrf)( #if INCLUDE_CPOTRF void LAPACK(cpotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_cpotrf(uplo, n, A, ldA, info); } @@ -171,9 +171,9 @@ void LAPACK(cpotrf)( #if INCLUDE_ZPOTRF void LAPACK(zpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_zpotrf(uplo, n, A, ldA, info); } @@ -186,9 +186,9 @@ void LAPACK(zpotrf)( #if INCLUDE_SPBTRF void LAPACK(spbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -196,9 +196,9 @@ void LAPACK(spbtrf)( #if INCLUDE_DPBTRF void LAPACK(dpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -206,9 +206,9 @@ void LAPACK(dpbtrf)( #if INCLUDE_CPBTRF void LAPACK(cpbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -216,9 +216,9 @@ void LAPACK(cpbtrf)( #if INCLUDE_ZPBTRF void LAPACK(zpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -231,9 +231,9 @@ void LAPACK(zpbtrf)( #if INCLUDE_SSYTRF void LAPACK(ssytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -241,9 +241,9 @@ void LAPACK(ssytrf)( #if INCLUDE_DSYTRF void LAPACK(dsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -251,9 +251,9 @@ void LAPACK(dsytrf)( #if INCLUDE_CSYTRF void LAPACK(csytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -261,9 
+261,9 @@ void LAPACK(csytrf)( #if INCLUDE_ZSYTRF void LAPACK(zsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -271,9 +271,9 @@ void LAPACK(zsytrf)( #if INCLUDE_CHETRF void LAPACK(chetrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -281,9 +281,9 @@ void LAPACK(chetrf)( #if INCLUDE_ZHETRF void LAPACK(zhetrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -291,9 +291,9 @@ void LAPACK(zhetrf)( #if INCLUDE_SSYTRF_ROOK void LAPACK(ssytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -301,9 +301,9 @@ void LAPACK(ssytrf_rook)( #if INCLUDE_DSYTRF_ROOK void LAPACK(dsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -311,9 +311,9 @@ void LAPACK(dsytrf_rook)( #if INCLUDE_CSYTRF_ROOK void LAPACK(csytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -321,9 +321,9 @@ void LAPACK(csytrf_rook)( #if INCLUDE_ZSYTRF_ROOK void LAPACK(zsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -331,9 +331,9 @@ void LAPACK(zsytrf_rook)( #if INCLUDE_CHETRF_ROOK void LAPACK(chetrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -341,9 +341,9 @@ void LAPACK(chetrf_rook)( #if INCLUDE_ZHETRF_ROOK void LAPACK(zhetrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, 
const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -356,9 +356,9 @@ void LAPACK(zhetrf_rook)( #if INCLUDE_SGETRF void LAPACK(sgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); } @@ -366,9 +366,9 @@ void LAPACK(sgetrf)( #if INCLUDE_DGETRF void LAPACK(dgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); } @@ -376,9 +376,9 @@ void LAPACK(dgetrf)( #if INCLUDE_CGETRF void LAPACK(cgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); } @@ -386,9 +386,9 @@ void LAPACK(cgetrf)( #if INCLUDE_ZGETRF void LAPACK(zgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); } @@ -401,9 +401,9 @@ void LAPACK(zgetrf)( #if INCLUDE_SGBTRF void LAPACK(sgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -411,9 +411,9 @@ void LAPACK(sgbtrf)( #if INCLUDE_DGBTRF void LAPACK(dgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -421,9 +421,9 @@ void LAPACK(dgbtrf)( #if INCLUDE_CGBTRF void LAPACK(cgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -431,9 +431,9 @@ void LAPACK(cgbtrf)( #if INCLUDE_ZGBTRF void LAPACK(zgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -446,11 +446,11 @@ void LAPACK(zgbtrf)( #if INCLUDE_STRSYL void LAPACK(strsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { RELAPACK_strsyl(tranA, tranB, 
isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -458,11 +458,11 @@ void LAPACK(strsyl)( #if INCLUDE_DTRSYL void LAPACK(dtrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -470,11 +470,11 @@ void LAPACK(dtrsyl)( #if INCLUDE_CTRSYL void LAPACK(ctrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -482,11 +482,11 @@ void LAPACK(ctrsyl)( #if INCLUDE_ZTRSYL void LAPACK(ztrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -499,13 +499,13 @@ void LAPACK(ztrsyl)( #if INCLUDE_STGSYL void LAPACK(stgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -513,13 +513,13 @@ void LAPACK(stgsyl)( #if INCLUDE_DTGSYL void LAPACK(dtgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info + double *Work, const 
blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -527,13 +527,13 @@ void LAPACK(dtgsyl)( #if INCLUDE_CTGSYL void LAPACK(ctgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -541,13 +541,13 @@ void LAPACK(ctgsyl)( #if INCLUDE_ZTGSYL void LAPACK(ztgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -561,10 +561,10 @@ void LAPACK(ztgsyl)( #if INCLUDE_SGEMMT void LAPACK(sgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { RELAPACK_sgemmt(uplo, n, A, ldA, info); } @@ -573,10 +573,10 @@ void LAPACK(sgemmt)( #if INCLUDE_DGEMMT void LAPACK(dgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { RELAPACK_dgemmt(uplo, n, A, ldA, info); } @@ -585,10 +585,10 @@ void LAPACK(dgemmt)( #if INCLUDE_CGEMMT void LAPACK(cgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const 
blasint *ldC ) { RELAPACK_cgemmt(uplo, n, A, ldA, info); } @@ -597,10 +597,10 @@ void LAPACK(cgemmt)( #if INCLUDE_ZGEMMT void LAPACK(zgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { RELAPACK_zgemmt(uplo, n, A, ldA, info); } diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h index 2cb061c32..38c5c30d0 100644 --- a/relapack/src/relapack.h +++ b/relapack/src/relapack.h @@ -1,6 +1,14 @@ #ifndef RELAPACK_INT_H #define RELAPACK_INT_H - +#include <stdint.h> +#include "../../config.h" +#if defined(OS_WINDOWS) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif #include "../config.h" #include "../inc/relapack.h" @@ -38,23 +46,23 @@ #include "blas.h" // sytrf helper routines -void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const
blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_ssytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); // trsyl helper routines -void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); -void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_strsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_dtrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); +void RELAPACK_ctrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_ztrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); #endif /* RELAPACK_INT_H */ diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index bc20e744b..3a4de4ece 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, - const int *, float *, const int *, int *, float *, const int *, float *, - const int *, int *); +static void RELAPACK_sgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, float *, const blasint *, blasint *, float *, const blasint *, float *, + const blasint *, blasint *); /** SGBTRF computes an LU factorization of a real 
m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,11 +13,10 @@ static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html * */ void RELAPACK_sgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { - // Check arguments *info = 0; if (*m < 0) @@ -31,8 +30,8 @@ void RELAPACK_sgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SGBTRF", &minfo, strlen("SGBTRF")); return; } @@ -40,14 +39,14 @@ void RELAPACK_sgbtrf( const float ZERO[] = { 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskewg A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { float *const A_j = A + *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +54,11 @@ void RELAPACK_sgbtrf( } // Allocate work space - const int n1 = SREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = SREC_SPLIT(*n); + const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const blasint nWorkl = (kv > n1) ? n1 : kv; + const blasint mWorku = (*kl > n1) ? n1 : *kl; + const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); float *Worku = malloc(mWorku * nWorku * sizeof(float)); LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +75,10 @@ void RELAPACK_sgbtrf( /** sgbtrf's recursive compute kernel */ static void RELAPACK_sgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + float *Workl, const blasint *ldWorkl, float *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { @@ -91,25 +90,25 @@ static void RELAPACK_sgbtrf_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + kv; // Splitting - const int n1 = MIN(SREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(SREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +128,14 @@ static void RELAPACK_sgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +163,7 @@ static void RELAPACK_sgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -180,7 +179,7 @@ static void RELAPACK_sgbtrf_rec( for (j = 0; j < n22; j++) { float *const A_Rrj = A_Rr + *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const float tmp = A_Rrj[i]; A_Rrj[i] = A_Rr[ip]; @@ -208,7 +207,7 @@ static void RELAPACK_sgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); diff --git a/relapack/src/sgemmt.c b/relapack/src/sgemmt.c index 75f78fabd..93438858c 100644 --- a/relapack/src/sgemmt.c +++ b/relapack/src/sgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); /** SGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_sgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + 
const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,13 +32,13 @@ void RELAPACK_sgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !notransA) @@ -56,7 +56,7 @@ void RELAPACK_sgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("SGEMMT", &info); + LAPACK(xerbla)("SGEMMT", &info, strlen("SGEMMT")); return; } @@ -74,10 +74,10 @@ void RELAPACK_sgemmt( /** sgemmt's recursive compute kernel */ static void RELAPACK_sgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { @@ -87,8 +87,8 @@ static void RELAPACK_sgemmt_rec( } // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -124,16 +124,16 @@ static void RELAPACK_sgemmt_rec( /** sgemmt's unblocked compute kernel */ static void RELAPACK_sgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -149,13 +149,13 @@ static void RELAPACK_sgemmt_rec2( float *const C_ii = C + *ldC * i + i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 284f8cff6..9d0ff1039 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, - int *, int *); +static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, + blasint *, blasint *); /** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html * */ void RELAPACK_sgetrf( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -25,12 +25,12 @@ void RELAPACK_sgetrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_sgetrf( if (*m < *n) { // Constants const float ONE[] = { 1. }; - const int iONE[] = { 1. }; + const blasint iONE[] = { 1. }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const float *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_sgetrf( /** sgetrf's recursive compute kernel */ static void RELAPACK_sgetrf_rec( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_SGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_sgetrf_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R float *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_sgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_sgetrf_rec( // apply pivots to A_BL LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/slauum.c b/relapack/src/slauum.c index 280f141b3..79212817f 100644 --- a/relapack/src/slauum.c +++ b/relapack/src/slauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_slauum_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_slauum_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_slauum_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html * */ void RELAPACK_slauum( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_slauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SLAUUM", &minfo, strlen("SLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_slauum( /** slauum's recursive compute kernel */ static void RELAPACK_slauum_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_SLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_slauum_rec( const float ONE[] = { 1. }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c index ee0a5546e..26804dcc2 100644 --- a/relapack/src/spbtrf.c +++ b/relapack/src/spbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, - float *, const int *, float *, const int *, int *); +static void RELAPACK_spbtrf_rec(const char *, const blasint *, const blasint *, + float *, const blasint *, float *, const blasint *, blasint *); /** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. 
@@ -12,14 +12,14 @@ static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html * */ void RELAPACK_spbtrf( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_spbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SPBTRF", &minfo, strlen("SPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_spbtrf( const float ZERO[] = { 0. }; // Allocate work space - const int n1 = SREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = SREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; float *Work = malloc(mWork * nWork * sizeof(float)); LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_spbtrf( /** spbtrf's recursive compute kernel */ static void RELAPACK_spbtrf_rec( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - float *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + float *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_spbtrf_rec( const float MONE[] = { -1. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(SREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(SREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_spbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/spotrf.c b/relapack/src/spotrf.c index 2a609321b..b22e917f7 100644 --- a/relapack/src/spotrf.c +++ b/relapack/src/spotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_spotrf_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_spotrf_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. 
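The SPOTRF hunk that follows retypes only the integers, but it sits inside RELAPACK's recursive scheme, visible in the splitting comments: factor A_TL, solve the panel A_BL against it, update the trailing block A_BR, and recurse. A self-contained sketch of that pattern in double precision, with naive loops standing in for the TRSM and SYRK calls and a plain halving macro standing in for SREC_SPLIT (both are assumptions of this sketch, not the library's kernels):

#include <math.h>
#include <stdio.h>

typedef int blasint; /* assumption: LP64 build */

#define REC_SPLIT(n) (((n) + 1) / 2) /* stand-in for SREC_SPLIT */

/* Recursive lower Cholesky of a column-major n-by-n matrix. */
static void potrf_rec(const blasint n, double *A, const blasint ldA)
{
    if (n == 1) { /* unblocked base case */
        A[0] = sqrt(A[0]);
        return;
    }

    /* Splitting */
    const blasint n1 = REC_SPLIT(n);
    const blasint n2 = n - n1;

    /* A_TL A_TR
       A_BL A_BR */
    double *const A_TL = A;
    double *const A_BL = A + n1;
    double *const A_BR = A + (size_t)ldA * n1 + n1;

    /* recursion(A_TL) */
    potrf_rec(n1, A_TL, ldA);

    /* A_BL = A_BL / A_TL' (naive TRSM) */
    for (blasint j = 0; j < n1; j++)
        for (blasint i = 0; i < n2; i++) {
            double s = A_BL[i + (size_t)ldA * j];
            for (blasint k = 0; k < j; k++)
                s -= A_BL[i + (size_t)ldA * k] * A_TL[j + (size_t)ldA * k];
            A_BL[i + (size_t)ldA * j] = s / A_TL[j + (size_t)ldA * j];
        }

    /* A_BR = A_BR - A_BL * A_BL' (naive SYRK, lower part only) */
    for (blasint j = 0; j < n2; j++)
        for (blasint i = j; i < n2; i++)
            for (blasint k = 0; k < n1; k++)
                A_BR[i + (size_t)ldA * j] -=
                    A_BL[i + (size_t)ldA * k] * A_BL[j + (size_t)ldA * k];

    /* recursion(A_BR) */
    potrf_rec(n2, A_BR, ldA);
}

int main(void)
{
    double A[4] = { 4., 2., 2., 3. }; /* column-major 2x2 SPD matrix */
    potrf_rec(2, A, 2);
    printf("L = [%g 0; %g %g]\n", A[0], A[1], A[3]);
    return 0;
}

The point of the recursion is that the flop-heavy work lands in large TRSM/SYRK calls regardless of n, which is why only the integer types, not the structure, change in this patch.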
@@ -11,14 +11,14 @@ static void RELAPACK_spotrf_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html * */ void RELAPACK_spotrf( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_spotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SPOTRF", &minfo, strlen("SPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_spotrf( /** spotrf's recursive compute kernel */ static void RELAPACK_spotrf_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_spotrf_rec( const float MONE[] = { -1. }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c index 7f145cdec..4259f9031 100644 --- a/relapack/src/ssygst.c +++ b/relapack/src/ssygst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_ssygst_rec(const int *, const char *, const int *, - float *, const int *, const float *, const int *, - float *, const int *, int *); +static void RELAPACK_ssygst_rec(const blasint *, const char *, const blasint *, + float *, const blasint *, const float *, const blasint *, + float *, const blasint *, blasint *); /** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. 
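The motivation for retyping every int as blasint in these signatures: on an INTERFACE64 (ILP64) build the Fortran INTEGER is 8 bytes, so a wrapper that still takes int * hands the callee a pointer to 4 bytes where 8 will be read, corrupting dimensions and pivot indices. A sketch of the sizing scheme, mirroring the BLASLONG typedefs this patch adds to relapack.h; the USE64BITINT spelling is borrowed from the rest of OpenBLAS and should be treated as an assumption here:

#include <assert.h>
#include <stdio.h>

#if defined(OS_WINDOWS) && defined(__64BIT__)
typedef long long BLASLONG;
#else
typedef long BLASLONG;
#endif

#ifdef USE64BITINT
typedef BLASLONG blasint; /* ILP64: matches an 8-byte Fortran INTEGER */
#else
typedef int blasint;      /* LP64 default: 4-byte INTEGER */
#endif

int main(void)
{
    /* Whatever the build, sizeof(blasint) is what must line up with the
       Fortran side; a hardcoded int * only lines up on LP64 builds. */
    printf("sizeof(blasint) = %zu\n", sizeof(blasint));
    assert(sizeof(blasint) == 4 || sizeof(blasint) == 8);
    return 0;
}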
@@ -15,14 +15,14 @@ static void RELAPACK_ssygst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html * */ void RELAPACK_ssygst( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_ssygst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYGST", &minfo, strlen("SSYGST")); return; } @@ -45,9 +45,9 @@ void RELAPACK_ssygst( // Allocate work space float *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = SREC_SPLIT(*n); + const blasint n1 = SREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * sizeof(float)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_ssygst( /** ssygst's recursive compute kernel */ static void RELAPACK_ssygst_rec( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - float *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_SSYGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_ssygst_rec( const float MONE[] = { -1. }; const float HALF[] = { .5 }; const float MHALF[] = { -.5 }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c index 8a4fad9f2..9fe7ce4a6 100644 --- a/relapack/src/ssytrf.c +++ b/relapack/src/ssytrf.c @@ -2,9 +2,8 @@ #if XSYTRF_ALLOW_MALLOC #include <stdlib.h> #endif - -static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_ssytrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** SSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
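SSYTRF, converted next, is the one routine here that weighs a caller-supplied workspace against an optional malloc fallback (XSYTRF_ALLOW_MALLOC), with n * (n / 2) floats required by its recursion. A condensed sketch of that pattern; only the size formula and the fallback logic are taken from the routine, the names are illustrative:

#include <stdlib.h>

typedef int blasint; /* assumption: LP64 build */

/* Run a kernel that needs n*(n/2) floats of workspace, preferring the
   caller's buffer and falling back to a private allocation. */
static int with_workspace(blasint n, float *Work, blasint lWork)
{
    const blasint cleanlWork = n * (n / 2); /* required size, as in SSYTRF */
    float *buf = Work;
    int allocated = 0;

    if (Work == NULL || lWork < cleanlWork) {
        buf = malloc((size_t)cleanlWork * sizeof(float));
        if (buf == NULL)
            return -1; /* mirrors the routine's *info error path */
        allocated = 1;
    }

    /* ... the recursive kernel would run on buf here ... */

    if (allocated)
        free(buf);
    return 0;
}

int main(void)
{
    return with_workspace(8, NULL, 0); /* forces the malloc fallback */
}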
@@ -14,21 +13,21 @@ static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html * */ void RELAPACK_ssytrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +54,8 @@ void RELAPACK_ssytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); return; } @@ -64,7 +63,7 @@ void RELAPACK_ssytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +77,13 @@ void RELAPACK_ssytrf( /** ssytrf's recursive compute kernel */ static void RELAPACK_ssytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_SSYTRF, 3)) { // Unblocked @@ -96,34 +95,34 @@ static void RELAPACK_ssytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = SREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = SREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +138,23 @@ static void RELAPACK_ssytrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + n1; float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -182,22 +181,22 @@ static void RELAPACK_ssytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = SREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = SREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +212,19 @@ static void RELAPACK_ssytrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/ssytrf_rec2.c b/relapack/src/ssytrf_rec2.c index edc9269ec..13856f064 100644 --- a/relapack/src/ssytrf_rec2.c +++ b/relapack/src/ssytrf_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static float c_b8 = -1.f; static float c_b9 = 1.f; @@ -25,32 +25,32 @@ static float c_b9 = 1.f; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
 * */
-/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int *
-    nb, int *kb, float *a, int *lda, int *ipiv, float *w,
-    int *ldw, int *info, ftnlen uplo_len)
+/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, blasint *n, blasint *
+    nb, blasint *kb, float *a, blasint *lda, blasint *ipiv, float *w,
+    blasint *ldw, blasint *info, ftnlen uplo_len)
{
    /* System generated locals */
-    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
+    blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
    float r__1, r__2, r__3;

    /* Builtin functions */
    double sqrt(double);

    /* Local variables */
-    static int j, k;
+    static blasint j, k;
    static float t, r1, d11, d21, d22;
-    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static blasint jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
-    extern /* Subroutine */ int sscal_(int *, float *, float *, int *),
-        sgemv_(char *, int *, int *, float *, float *, int *,
-        float *, int *, float *, float *, int *, ftnlen);
-    static int kstep;
-    extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
-        int *), sswap_(int *, float *, int *, float *, int *
-        );
+    extern /* Subroutine */ blasint sscal_(blasint *, float *, float *, blasint *),
+        sgemv_(char *, blasint *, blasint *, float *, float *, blasint *,
+        float *, blasint *, float *, float *, blasint *, ftnlen);
+    static blasint kstep;
+    extern /* Subroutine */ blasint scopy_(blasint *, float *, blasint *, float *,
+        blasint *), sswap_(blasint *, float *, blasint *, float *, blasint *
+        );
    static float absakk;
-    extern int isamax_(int *, float *, int *);
+    extern blasint isamax_(blasint *, float *, blasint *);
    static float colmax, rowmax;

    /* Parameter adjustments */
diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c
index 040df2484..abcf29d1c 100644
--- a/relapack/src/ssytrf_rook.c
+++ b/relapack/src/ssytrf_rook.c
@@ -3,8 +3,8 @@
 #include <stdlib.h>
 #endif

-static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *,
-    float *, const int *, int *, float *, const int *, int *);
+static void RELAPACK_ssytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *,
+    float *, const blasint *, blasint *, float *, const blasint *, blasint *);

/** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
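The f2c-generated *_rec2 kernels illustrate why the extern declarations matter as much as the definitions: a forward declaration that still says int * while the caller passes a blasint * compiles cleanly on 32-bit-interface builds but, under ILP64, makes the callee read only half of an 8-byte integer. A contrived sketch of that failure mode, with hypothetical function names that do not appear in this patch:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t blasint;            /* as on an ILP64 build */

    /* Correctly converted: the pointee width matches the argument. */
    static void scal_ok(blasint *n)  { printf("n = %lld\n", (long long)*n); }

    /* A stale int* prototype reads 4 of the 8 bytes -- the class of bug
     * that any int* left behind in converted declarations invites. */
    static void scal_stale(int *n)   { printf("n = %d\n", *n); }

    int main(void)
    {
        blasint n = 5;
        scal_ok(&n);
        scal_stale((int *)&n);   /* 5 or 0 depending on byte order */
        return 0;
    }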
@@ -14,21 +14,21 @@ static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html * */ void RELAPACK_ssytrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_ssytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_ssytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_ssytrf_rook( /** ssytrf_rook's recursive compute kernel */ static void RELAPACK_ssytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_ssytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = SREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = SREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_ssytrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + n1; float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_ssytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_ssytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = SREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = SREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_ssytrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/ssytrf_rook_rec2.c b/relapack/src/ssytrf_rook_rec2.c index 3308826d7..41659cb3e 100644 --- a/relapack/src/ssytrf_rook_rec2.c +++ b/relapack/src/ssytrf_rook_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static float c_b9 = -1.f; static float c_b10 = 1.f; @@ -25,39 +25,39 @@ static float c_b10 = 1.f; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
 * */
-/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n,
-    int *nb, int *kb, float *a, int *lda, int *ipiv, float *
-    w, int *ldw, int *info, ftnlen uplo_len)
+/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, blasint *n,
+    blasint *nb, blasint *kb, float *a, blasint *lda, blasint *ipiv, float *
+    w, blasint *ldw, blasint *info, ftnlen uplo_len)
{
    /* System generated locals */
-    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
+    blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
    float r__1;

    /* Builtin functions */
    double sqrt(double);

    /* Local variables */
-    static int j, k, p;
+    static blasint j, k, p;
    static float t, r1, d11, d12, d21, d22;
-    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw;
    static logical done;
-    static int imax, jmax;
+    static blasint imax, jmax;
    static float alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
-    extern /* Subroutine */ int sscal_(int *, float *, float *, int *);
+    extern /* Subroutine */ blasint sscal_(blasint *, float *, float *, blasint *);
    static float sfmin;
-    static int itemp;
-    extern /* Subroutine */ int sgemv_(char *, int *, int *, float *,
-        float *, int *, float *, int *, float *, float *, int *,
+    static blasint itemp;
+    extern /* Subroutine */ blasint sgemv_(char *, blasint *, blasint *, float *,
+        float *, blasint *, float *, blasint *, float *, float *, blasint *,
        ftnlen);
-    static int kstep;
+    static blasint kstep;
    static float stemp;
-    extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
-        int *), sswap_(int *, float *, int *, float *, int *
-        );
+    extern /* Subroutine */ blasint scopy_(blasint *, float *, blasint *, float *,
+        blasint *), sswap_(blasint *, float *, blasint *, float *, blasint *
+        );
    static float absakk;
    extern double slamch_(char *, ftnlen);
-    extern int isamax_(int *, float *, int *);
+    extern blasint isamax_(blasint *, float *, blasint *);
    static float colmax, rowmax;

    /* Parameter adjustments */
diff --git a/relapack/src/stgsyl.c b/relapack/src/stgsyl.c
index 1870fb928..6bace9f17 100644
--- a/relapack/src/stgsyl.c
+++ b/relapack/src/stgsyl.c
@@ -1,11 +1,11 @@
 #include "relapack.h"
 #include <math.h>

-static void RELAPACK_stgsyl_rec(const char *, const int *, const int *,
-    const int *, const float *, const int *, const float *, const int *,
-    float *, const int *, const float *, const int *, const float *,
-    const int *, float *, const int *, float *, float *, float *, int *, int *,
-    int *);
+static void RELAPACK_stgsyl_rec(const char *, const blasint *, const blasint *,
+    const blasint *, const float *, const blasint *, const float *, const blasint *,
+    float *, const blasint *, const float *, const blasint *, const float *,
+    const blasint *, float *, const blasint *, float *, float *, float *, blasint *, blasint *,
+    blasint *);

/** STGSYL solves the generalized Sylvester equation.
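One convention worth spelling out before the next hunk: Fortran-interface BLAS passes every scalar by reference, which is why these sources keep one-element constant arrays (iONE, ONE, MONE) whose addresses can be handed straight to the callee. A minimal sketch of the pattern, assuming the standard netlib calling sequence for scopy:

    typedef int blasint;   /* 32-bit interface assumed for this sketch */

    /* Fortran-convention BLAS copy: all scalars travel by reference. */
    extern void scopy_(const blasint *n, const float *x, const blasint *incx,
                       float *y, const blasint *incy);

    static void copy_vector(const float *src, float *dst, blasint n)
    {
        const blasint iONE[] = { 1 };  /* unit stride, passed by address */
        scopy_(&n, src, iONE, dst, iONE);
    }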
@@ -15,21 +15,21 @@ static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html * */ void RELAPACK_stgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "T"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "T"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -58,8 +58,8 @@ void RELAPACK_stgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("STGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("STGSYL", &minfo, strlen("STGSYL")); return; } @@ -75,8 +75,8 @@ void RELAPACK_stgsyl( // Constant const float ZERO[] = { 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -87,12 +87,12 @@ void RELAPACK_stgsyl( } float scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; float dscale = 0; float dsum = 1; - int pq; + blasint pq; RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); if (dscale != 0) { if (*ijob == 1 || *ijob == 3) @@ -121,13 +121,13 @@ void RELAPACK_stgsyl( /** stgsyl's recursive vompute kernel */ static void RELAPACK_stgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dsum, float *dscale, - int *iWork, int *pq, int *info + blasint *iWork, blasint *pq, blasint *info ) { if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) { @@ -139,20 +139,20 @@ static void RELAPACK_stgsyl_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1. }; float scale2[] = { 1. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = SREC_SPLIT(*m); + blasint m1 = SREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -210,10 +210,10 @@ static void RELAPACK_stgsyl_rec( } } else { // Splitting - int n1 = SREC_SPLIT(*n); + blasint n1 = SREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c index 83947ef1a..012fb3548 100644 --- a/relapack/src/strsyl.c +++ b/relapack/src/strsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_strsyl_rec(const char *, const char *, const int *, - const int *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, int *); +static void RELAPACK_strsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, blasint *); /** STRSYL solves the real Sylvester matrix equation. @@ -12,20 +12,20 @@ static void RELAPACK_strsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html * */ void RELAPACK_strsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int transA = LAPACK(lsame)(tranA, "T"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int transB = LAPACK(lsame)(tranB, "T"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint transA = LAPACK(lsame)(tranA, "T"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint transB = LAPACK(lsame)(tranB, "T"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!transA && !ctransA && !notransA) *info = -1; @@ -44,8 +44,8 @@ void RELAPACK_strsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("STRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("STRSYL", &minfo, strlen("STRSYL")); return; } @@ -60,11 +60,11 @@ void RELAPACK_strsyl( /** strsyl's recursive compute kernel */ static void RELAPACK_strsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) { @@ -77,20 +77,20 @@ static void RELAPACK_strsyl_rec( const float ONE[] = { 1. }; const float MONE[] = { -1. 
};
    const float MSGN[] = { -*isgn };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };

    // Outputs
    float scale1[] = { 1. };
    float scale2[] = { 1. };
-    int info1[] = { 0 };
-    int info2[] = { 0 };
+    blasint info1[] = { 0 };
+    blasint info2[] = { 0 };

    if (*m > *n) {
        // Splitting
-        int m1 = SREC_SPLIT(*m);
+        blasint m1 = SREC_SPLIT(*m);
        if (A[m1 + *ldA * (m1 - 1)])
            m1++;
-        const int m2 = *m - m1;
+        const blasint m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
@@ -126,10 +126,10 @@
    } else {
        // Splitting
-        int n1 = SREC_SPLIT(*n);
+        blasint n1 = SREC_SPLIT(*n);
        if (B[n1 + *ldB * (n1 - 1)])
            n1++;
-        const int n2 = *n - n1;
+        const blasint n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
diff --git a/relapack/src/strsyl_rec2.c b/relapack/src/strsyl_rec2.c
index 6d40a475d..37a24c7dc 100644
--- a/relapack/src/strsyl_rec2.c
+++ b/relapack/src/strsyl_rec2.c
@@ -14,48 +14,48 @@

 /* Table of constant values */

-static int c__1 = 1;
-static int c_false = FALSE_;
-static int c__2 = 2;
+static blasint c__1 = 1;
+static blasint c_false = FALSE_;
+static blasint c__2 = 2;
 static float c_b26 = 1.f;
 static float c_b30 = 0.f;
-static int c_true = TRUE_;
+static blasint c_true = TRUE_;

-void RELAPACK_strsyl_rec2(char *trana, char *tranb, int *isgn, int
-    *m, int *n, float *a, int *lda, float *b, int *ldb, float *
-    c__, int *ldc, float *scale, int *info, ftnlen trana_len,
+void RELAPACK_strsyl_rec2(char *trana, char *tranb, blasint *isgn, blasint
+    *m, blasint *n, float *a, blasint *lda, float *b, blasint *ldb, float *
+    c__, blasint *ldc, float *scale, blasint *info, ftnlen trana_len,
    ftnlen tranb_len)
{
    /* System generated locals */
-    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+    blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
        i__3, i__4;
    float r__1, r__2;

    /* Local variables */
-    static int j, k, l;
+    static blasint j, k, l;
    static float x[4] /* was [2][2] */;
-    static int k1, k2, l1, l2;
+    static blasint k1, k2, l1, l2;
    static float a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn;
-    static int ierr;
+    static blasint ierr;
    static float smin;
-    extern float sdot_(int *, float *, int *, float *, int *);
+    extern float sdot_(blasint *, float *, blasint *, float *, blasint *);
    static float suml, sumr;
-    extern int lsame_(char *, char *, ftnlen, ftnlen);
-    extern /* Subroutine */ int sscal_(int *, float *, float *, int *);
-    static int knext, lnext;
+    extern blasint lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ blasint sscal_(blasint *, float *, float *, blasint *);
+    static blasint knext, lnext;
    static float xnorm;
-    extern /* Subroutine */ int slaln2_(int *, int *, int *, float
-        *, float *, float *, int *, float *, float *, float *, int *,
-        float *, float *, float *, int *, float *, float *, int *),
-        slasy2_(int *, int *, int *, int *, int *,
-        float *, int *, float *, int *, float *, int *, float *,
-        float *, int *, float *, int *), slabad_(float *, float *);
+    extern /* Subroutine */ blasint slaln2_(blasint *, blasint *, blasint *, float
+        *, float *, float *, blasint *, float *, float *, float *, blasint *,
+        float *, float *, float *, blasint *, float *, float *, blasint *),
+        slasy2_(blasint *, blasint *, blasint *, blasint *, blasint *,
+        float *, blasint *, float *, blasint *, float *, blasint *, float *,
+        float *, blasint *, float *, blasint *), slabad_(float *, float *);
    static float scaloc;
-    extern float slamch_(char *, ftnlen), slange_(char *, int *,
-        int *, float *, int *, float *, ftnlen);
-    extern /* Subroutine */ int xerbla_(char *, int *, ftnlen);
+    extern float slamch_(char *, ftnlen), slange_(char *, blasint *,
+        blasint *, float *, blasint *, float *, ftnlen);
+    extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen);
    static float bignum;
-    static int notrna, notrnb;
+    static blasint notrna, notrnb;
    static float smlnum;

    /* Parameter adjustments */
diff --git a/relapack/src/strtri.c b/relapack/src/strtri.c
index d35bbd49f..18d11f5eb 100644
--- a/relapack/src/strtri.c
+++ b/relapack/src/strtri.c
@@ -1,7 +1,7 @@
 #include "relapack.h"

-static void RELAPACK_strtri_rec(const char *, const char *, const int *,
-    float *, const int *, int *);
+static void RELAPACK_strtri_rec(const char *, const char *, const blasint *,
+    float *, const blasint *, blasint *);

/** STRTRI computes the inverse of a real upper or lower triangular matrix A.
@@ -11,16 +11,16 @@ static void RELAPACK_strtri_rec(const char *, const char *, const int *,
 * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html
 * */
void RELAPACK_strtri(
-    const char *uplo, const char *diag, const int *n,
-    float *A, const int *ldA,
-    int *info
+    const char *uplo, const char *diag, const blasint *n,
+    float *A, const blasint *ldA,
+    blasint *info
) {

    // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
-    const int nounit = LAPACK(lsame)(diag, "N");
-    const int unit = LAPACK(lsame)(diag, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
+    const blasint nounit = LAPACK(lsame)(diag, "N");
+    const blasint unit = LAPACK(lsame)(diag, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
@@ -31,8 +31,8 @@ void RELAPACK_strtri(
    else if (*ldA < MAX(1, *n))
        *info = -5;
    if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("STRTRI", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("STRTRI", &minfo, strlen("STRTRI"));
        return;
    }
@@ -42,7 +42,7 @@ void RELAPACK_strtri(

    // check for singularity
    if (nounit) {
-        int i;
+        blasint i;
        for (i = 0; i < *n; i++)
            if (A[i + *ldA * i] == 0) {
                *info = i;
@@ -57,9 +57,9 @@ void RELAPACK_strtri(

/** strtri's recursive compute kernel */
static void RELAPACK_strtri_rec(
-    const char *uplo, const char *diag, const int *n,
-    float *A, const int *ldA,
-    int *info
+    const char *uplo, const char *diag, const blasint *n,
+    float *A, const blasint *ldA,
+    blasint *info
){

    if (*n <= MAX(CROSSOVER_STRTRI, 1)) {
@@ -73,8 +73,8 @@ static void RELAPACK_strtri_rec(
    const float MONE[] = { -1. };

    // Splitting
-    const int n1 = SREC_SPLIT(*n);
-    const int n2 = *n - n1;
+    const blasint n1 = SREC_SPLIT(*n);
+    const blasint n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c
index 3aa6bf531..0dd3fa7c3 100644
--- a/relapack/src/zgbtrf.c
+++ b/relapack/src/zgbtrf.c
@@ -1,9 +1,9 @@
 #include "relapack.h"
 #include "stdlib.h"

-static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *,
-    const int *, double *, const int *, int *, double *, const int *, double *,
-    const int *, int *);
+static void RELAPACK_zgbtrf_rec(const blasint *, const blasint *, const blasint *,
+    const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *,
+    const blasint *, blasint *);

/** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
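Reading the zgbtrf hunks that follow is easier with the band layout in mind: LAPACK stores a general band matrix column by column in an ldAb-by-n array with kl extra rows reserved for pivoting fill-in, and the code's unskewing step (A = Ab + 2 * kv with a leading dimension of *ldAb - 1) re-addresses that storage so each diagonal becomes contiguous. A sketch of the underlying indexing rule for the real-valued case (the accessor is hypothetical; the complex storage used here doubles the stride):

    #include <stddef.h>

    /* LAPACK general band storage, 0-based: with kl subdiagonals, ku
     * superdiagonals and kl fill-in rows, entry (i, j) of the band
     * matrix lives at row kl + ku + i - j of column j. Only valid for
     * indices inside the band. */
    static double band_get(const double *Ab, size_t ldAb,
                           int kl, int ku, int i, int j)
    {
        return Ab[(size_t)(kl + ku + i - j) + (size_t)j * ldAb];
    }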
@@ -13,9 +13,9 @@ static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html * */ void RELAPACK_zgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_zgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZGBTRF", &minfo, strlen("ZGBTRF")); return; } @@ -40,14 +40,14 @@ void RELAPACK_zgbtrf( const double ZERO[] = { 0., 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { double *const A_j = A + 2 * *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,11 @@ void RELAPACK_zgbtrf( } // Allocate work space - const int n1 = ZREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = ZREC_SPLIT(*n); + const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const blasint nWorkl = (kv > n1) ? n1 : kv; + const blasint mWorku = (*kl > n1) ? n1 : *kl; + const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_zgbtrf( /** zgbtrf's recursive compute kernel */ static void RELAPACK_zgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + double *Workl, const blasint *ldWorkl, double *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_zgbtrf_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * kv; // Splitting - const int n1 = MIN(ZREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(ZREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_zgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_zgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_zgbtrf_rec( for (j = 0; j < n22; j++) { double *const A_Rrj = A_Rr + 2 * *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const double tmpr = A_Rrj[2 * i]; const double tmpc = A_Rrj[2 * i + 1]; @@ -211,7 +211,7 @@ static void RELAPACK_zgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); diff --git a/relapack/src/zgemmt.c b/relapack/src/zgemmt.c index aa5930238..f53a3ca6f 100644 --- a/relapack/src/zgemmt.c +++ b/relapack/src/zgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); /** ZGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_zgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const 
blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,15 +32,15 @@ void RELAPACK_zgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int ctransA = LAPACK(lsame)(transA, "C"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - const int ctransB = LAPACK(lsame)(transB, "C"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint ctransA = LAPACK(lsame)(transA, "C"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + const blasint ctransB = LAPACK(lsame)(transB, "C"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !ctransA && !notransA) @@ -58,7 +58,7 @@ void RELAPACK_zgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("ZGEMMT", &info); + LAPACK(xerbla)("ZGEMMT", &info, strlen("ZGEMMT")); return; } @@ -76,10 +76,10 @@ void RELAPACK_zgemmt( /** zgemmt's recursive compute kernel */ static void RELAPACK_zgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { @@ -89,8 +89,8 @@ static void RELAPACK_zgemmt_rec( } // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -126,16 +126,16 @@ static void RELAPACK_zgemmt_rec( /** zgemmt's unblocked compute kernel */ static void RELAPACK_zgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -151,13 +151,13 @@ static void RELAPACK_zgemmt_rec2( double *const C_ii = C + 2 * *ldC * i + 2 * i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index cf8921e1f..121b03401 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zgetrf_rec(const int *, const int *, double *, - const int *, int *, int *); +static void RELAPACK_zgetrf_rec(const blasint *, const blasint *, double *, + const blasint *, blasint *, blasint *); /** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_zgetrf_rec(const int *, const int *, double *, * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html * */ void RELAPACK_zgetrf( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -25,12 +25,12 @@ void RELAPACK_zgetrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZGETRF", &minfo, strlen("ZGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_zgetrf( if (*m < *n) { // Constants const double ONE[] = { 1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const double *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_zgetrf( /** zgetrf's recursive compute kernel */ static void RELAPACK_zgetrf_rec( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_zgetrf_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1. }; + const blasint iONE[] = { 1. 
}; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R double *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_zgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_zgetrf_rec( // apply pivots to A_BL LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/zhegst.c b/relapack/src/zhegst.c index d0ece2148..dc9b7eace 100644 --- a/relapack/src/zhegst.c +++ b/relapack/src/zhegst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_zhegst_rec(const int *, const char *, const int *, - double *, const int *, const double *, const int *, - double *, const int *, int *); +static void RELAPACK_zhegst_rec(const blasint *, const char *, const blasint *, + double *, const blasint *, const double *, const blasint *, + double *, const blasint *, blasint *); /** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_zhegst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html * */ void RELAPACK_zhegst( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_zhegst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZHEGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZHEGST", &minfo, strlen("ZHEGST")); return; } @@ -45,9 +45,9 @@ void RELAPACK_zhegst( // Allocate work space double *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = ZREC_SPLIT(*n); + const blasint n1 = ZREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * 2 * sizeof(double)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_zhegst( /** zhegst's recursive compute kernel */ static void RELAPACK_zhegst_rec( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - double *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_zhegst_rec( const double MONE[] = { -1., 0. }; const double HALF[] = { .5, 0. }; const double MHALF[] = { -.5, 0. 
};
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };

    // Loop iterator
-    int i;
+    blasint i;

    // Splitting
-    const int n1 = ZREC_SPLIT(*n);
-    const int n2 = *n - n1;
+    const blasint n1 = ZREC_SPLIT(*n);
+    const blasint n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
diff --git a/relapack/src/zhetrf.c b/relapack/src/zhetrf.c
index ef4e1f5d5..3d458fecf 100644
--- a/relapack/src/zhetrf.c
+++ b/relapack/src/zhetrf.c
@@ -3,8 +3,8 @@
 #include <stdlib.h>
 #endif

-static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *,
-    double *, const int *, int *, double *, const int *, int *);
+static void RELAPACK_zhetrf_rec(const char *, const blasint *, const blasint *, blasint *,
+    double *, const blasint *, blasint *, double *, const blasint *, blasint *);

/** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
@@ -14,21 +14,21 @@ static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *,
 * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html
 * */
void RELAPACK_zhetrf(
-    const char *uplo, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *lWork, int *info
+    const char *uplo, const blasint *n,
+    double *A, const blasint *ldA, blasint *ipiv,
+    double *Work, const blasint *lWork, blasint *info
) {

    // Required work size
-    const int cleanlWork = *n * (*n / 2);
-    int minlWork = cleanlWork;
+    const blasint cleanlWork = *n * (*n / 2);
+    blasint minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
#endif

    // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
@@ -55,8 +55,8 @@ void RELAPACK_zhetrf(
#endif

    if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("ZHETRF", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF"));
        return;
    }
@@ -64,7 +64,7 @@ void RELAPACK_zhetrf(
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
-    int nout;
+    blasint nout;

    // Recursive kernel
    RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
@@ -78,13 +78,13 @@ void RELAPACK_zhetrf(

/** zhetrf's recursive compute kernel */
static void RELAPACK_zhetrf_rec(
-    const char *uplo, const int *n_full, const int *n, int *n_out,
-    double *A, const int *ldA, int *ipiv,
-    double *Work, const int *ldWork, int *info
+    const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out,
+    double *A, const blasint *ldA, blasint *ipiv,
+    double *Work, const blasint *ldWork, blasint *info
) {

    // top recursion level?
-    const int top = *n_full == *n;
+    const blasint top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_ZHETRF, 3)) {
        // Unblocked
@@ -96,31 +96,31 @@ static void RELAPACK_zhetrf_rec(
        return;
    }

-    int info1, info2;
+    blasint info1, info2;

    // Constants
    const double ONE[] = { 1., 0. };
    const double MONE[] = { -1., 0.
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zhetrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zhetrf_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zhetrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zhetrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zhetrf_rec2.c b/relapack/src/zhetrf_rec2.c index 867ea64e1..c14cf0440 100644 --- a/relapack/src/zhetrf_rec2.c +++ b/relapack/src/zhetrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int * - nb, int *kb, doublecomplex *a, int *lda, int *ipiv, - doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, doublecomplex *a, blasint *lda, blasint *ipiv, + doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2, d__3, d__4; doublecomplex z__1, z__2, z__3, z__4; @@ -39,26 +39,26 @@ static int c__1 = 1; doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k; + static blasint j, k; static double t, r1; static doublecomplex d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - static int kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + static blasint kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); static double absakk; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(int *, double *, + doublecomplex *, blasint *); static double colmax; - extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) + extern /* Subroutine */ blasint zlacgv_(int *, doublecomplex *, blasint *) ; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c index 15ceaeae7..285aea96e 100644 --- 
a/relapack/src/zhetrf_rook.c +++ b/relapack/src/zhetrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zhetrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html * */ void RELAPACK_zhetrf_rook( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zhetrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zhetrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zhetrf_rook( /** zhetrf_rook's recursive compute kernel */ static void RELAPACK_zhetrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zhetrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zhetrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? 
Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zhetrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zhetrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zhetrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zhetrf_rook_rec2.c b/relapack/src/zhetrf_rook_rec2.c index a56ad710b..e5033ad49 100644 --- a/relapack/src/zhetrf_rook_rec2.c +++ b/relapack/src/zhetrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, doublecomplex *a, int *lda, int * - ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, doublecomplex *a, blasint *lda, blasint * + ipiv, doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4, z__5; @@ -39,30 +39,30 @@ static int c__1 = 1; doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static double t, r1; static doublecomplex d11, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); static double dtemp, sfmin; - static int itemp, kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + static blasint itemp, kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); extern double dlamch_(char *, ftnlen); static double absakk; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(int *, double *, + doublecomplex *, blasint *); static double colmax; - extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) + extern /* Subroutine */ blasint zlacgv_(int *, doublecomplex *, blasint *) ; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zlauum.c b/relapack/src/zlauum.c index 490dcc82e..14fcd9213 100644 --- a/relapack/src/zlauum.c +++ b/relapack/src/zlauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zlauum_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_zlauum_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. 
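(editorial note) Two mechanical changes repeat through all of these files:
LAPACK-style integers become blasint, and every xerbla call gains a third
argument carrying the length of the routine name. The length is required
because a Fortran CHARACTER argument has a hidden length that a C caller
must pass explicitly. A minimal sketch of the convention, assuming the
usual underscore mangling (the hidden length is shown as int here; f2c
uses ftnlen and recent gfortran uses size_t):

#include <string.h>

typedef int blasint;                /* placeholder; the real typedef is in common.h */

/* Fortran: SUBROUTINE XERBLA( SRNAME, INFO ) */
extern void xerbla_(const char *srname, const blasint *info, int srname_len);

static void report_bad_arg(blasint info) {
    const blasint minfo = -info;
    /* strlen() excludes the trailing NUL, matching Fortran's notion of
       string length */
    xerbla_("ZLAUUM", &minfo, (int) strlen("ZLAUUM"));
}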
@@ -11,14 +11,14 @@ static void RELAPACK_zlauum_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html * */ void RELAPACK_zlauum( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_zlauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZLAUUM", &minfo, strlen("ZLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zlauum( /** zlauum's recursive compute kernel */ static void RELAPACK_zlauum_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_zlauum_rec( const double ONE[] = { 1., 0. }; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c index 37e711c9d..fb0e1e97b 100644 --- a/relapack/src/zpbtrf.c +++ b/relapack/src/zpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, - double *, const int *, double *, const int *, int *); +static void RELAPACK_zpbtrf_rec(const char *, const blasint *, const blasint *, + double *, const blasint *, double *, const blasint *, blasint *); /** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. @@ -12,14 +12,14 @@ static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html * */ void RELAPACK_zpbtrf( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_zpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZPBTRF", &minfo, strlen("ZPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zpbtrf( const double ZERO[] = { 0., 0. }; // Allocate work space - const int n1 = ZREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = ZREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; double *Work = malloc(mWork * nWork * 2 * sizeof(double)); LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_zpbtrf( /** zpbtrf's recursive compute kernel */ static void RELAPACK_zpbtrf_rec( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - double *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + double *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_zpbtrf_rec( const double MONE[] = { -1., 0. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(ZREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(ZREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_zpbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/zpotrf.c b/relapack/src/zpotrf.c index 411ac5fc0..9259279c1 100644 --- a/relapack/src/zpotrf.c +++ b/relapack/src/zpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zpotrf_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_zpotrf_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. @@ -11,14 +11,14 @@ static void RELAPACK_zpotrf_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html * */ void RELAPACK_zpotrf( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_zpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZPOTRF", &minfo, strlen("ZPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zpotrf( /** zpotrf's recursive compute kernel */ static void RELAPACK_zpotrf_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_zpotrf_rec( const double MONE[] = { -1., 0. 
}; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c index 3be21563a..f3412ad8f 100644 --- a/relapack/src/zsytrf.c +++ b/relapack/src/zsytrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zsytrf_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html * */ void RELAPACK_zsytrf( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zsytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zsytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zsytrf( /** zsytrf's recursive compute kernel */ static void RELAPACK_zsytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_zsytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. 
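/* (editorial note: these are complex constants stored as {real, imaginary}
   pairs of doubles, i.e. 1+0i and -1+0i; the same layout is why pointer
   offsets in the z-routines carry a factor of 2) */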
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_zsytrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_zsytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_zsytrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zsytrf_rec2.c b/relapack/src/zsytrf_rec2.c index 33902ee9e..ff17267c7 100644 --- a/relapack/src/zsytrf_rec2.c +++ b/relapack/src/zsytrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int * - nb, int *kb, doublecomplex *a, int *lda, int *ipiv, - doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, doublecomplex *a, blasint *lda, blasint *ipiv, + doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2, d__3, d__4; doublecomplex z__1, z__2, z__3; @@ -38,22 +38,22 @@ static int c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k; + static blasint j, k; static doublecomplex t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int zscal_(int *, doublecomplex *, - doublecomplex *, int *); - static int kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zscal_(int *, doublecomplex *, + doublecomplex *, blasint *); + static blasint kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); static double absakk, colmax; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c index c598f7b1e..fc6d73645 100644 --- a/relapack/src/zsytrf_rook.c +++ b/relapack/src/zsytrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zsytrf_rook_rec(const char *, const 
int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zsytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html * */ void RELAPACK_zsytrf_rook( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zsytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zsytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zsytrf_rook( /** zsytrf_rook's recursive compute kernel */ static void RELAPACK_zsytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zsytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zsytrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zsytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zsytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zsytrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zsytrf_rook_rec2.c b/relapack/src/zsytrf_rook_rec2.c index 9e111fe0c..4dbf8733a 100644 --- a/relapack/src/zsytrf_rook_rec2.c +++ b/relapack/src/zsytrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, doublecomplex *a, int *lda, int * - ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, doublecomplex *a, blasint *lda, blasint * + ipiv, doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4; @@ -38,26 +38,26 @@ static int c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static doublecomplex t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); static double dtemp, sfmin; - extern /* Subroutine */ int zscal_(int *, doublecomplex *, - doublecomplex *, int *); - static int itemp, kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zscal_(int *, doublecomplex *, + doublecomplex *, blasint *); + static blasint itemp, kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); extern double dlamch_(char *, ftnlen); static double absakk, colmax; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/ztgsyl.c b/relapack/src/ztgsyl.c index 2c8a35256..6a41475e8 100644 --- a/relapack/src/ztgsyl.c +++ b/relapack/src/ztgsyl.c @@ -1,10 +1,10 @@ #include "relapack.h" #include -static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, - const int *, const double *, const int *, const double *, const int *, - double *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, double *, double *, int *); +static void RELAPACK_ztgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const double *, const blasint *, const double *, const blasint *, + double *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, double *, double *, blasint *); /** ZTGSYL solves the generalized Sylvester equation. 
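(editorial note) The int -> blasint sweep is not cosmetic: in an
INTERFACE64=1 build a Fortran INTEGER is 64 bits wide, so passing the
address of a 32-bit int where the callee loads 64 bits reads garbage in
the high half, and a store through such a pointer clobbers a neighbouring
variable. A sketch of the underlying typedef, assuming the flag name used
in common.h:

#ifdef USE64BITINT
typedef long long blasint;          /* really BLASLONG in common.h */
#else
typedef int blasint;                /* default build: 32-bit INTEGER */
#endif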
@@ -14,21 +14,21 @@ static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html * */ void RELAPACK_ztgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "C"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "C"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -57,8 +57,8 @@ void RELAPACK_ztgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTGSYL", &minfo, strlen("ZTGSYL")); return; } @@ -74,8 +74,8 @@ void RELAPACK_ztgsyl( // Constant const double ZERO[] = { 0., 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -86,7 +86,7 @@ void RELAPACK_ztgsyl( } double scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; double dscale = 0; @@ -119,13 +119,13 @@ void RELAPACK_ztgsyl( /** ztgsyl's recursive vompute kernel */ static void RELAPACK_ztgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dsum, double *dscale, - int *info + blasint *info ) { if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) { @@ -137,18 +137,18 @@ static void RELAPACK_ztgsyl_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1., 0. }; double scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = ZREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = ZREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -206,8 +206,8 @@ static void RELAPACK_ztgsyl_rec( } } else { // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c index 82b2c8803..567ef115a 100644 --- a/relapack/src/ztrsyl.c +++ b/relapack/src/ztrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, - const int *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, int *); +static void RELAPACK_ztrsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, blasint *); /** ZTRSYL solves the complex Sylvester matrix equation. @@ -12,18 +12,18 @@ static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html * */ void RELAPACK_ztrsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!ctransA && !notransA) *info = -1; @@ -42,8 +42,8 @@ void RELAPACK_ztrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTRSYL", &minfo, strlen("ZTRSYL")); return; } @@ -58,11 +58,11 @@ void RELAPACK_ztrsyl( /** ztrsyl's recursive compute kernel */ static void RELAPACK_ztrsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { @@ -75,18 +75,18 @@ static void RELAPACK_ztrsyl_rec( const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; const double MSGN[] = { -*isgn, 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1., 0. }; double scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = ZREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = ZREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -122,8 +122,8 @@ static void RELAPACK_ztrsyl_rec( } } else { // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c index 526ab097c..edc6ffc6b 100644 --- a/relapack/src/ztrsyl_rec2.c +++ b/relapack/src/ztrsyl_rec2.c @@ -14,16 +14,16 @@ #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES -doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { - extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); +doublecomplex zdotu_fun(int *n, doublecomplex *x, blasint *incx, doublecomplex *y, blasint *incy) { + extern void zdotu_(doublecomplex *, blasint *, doublecomplex *, blasint *, doublecomplex *, blasint *); doublecomplex result; zdotu_(&result, n, x, incx, y, incy); return result; } #define zdotu_ zdotu_fun -doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { - extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); +doublecomplex zdotc_fun(int *n, doublecomplex *x, blasint *incx, doublecomplex *y, blasint *incy) { + extern void zdotc_(doublecomplex *, blasint *, doublecomplex *, blasint *, doublecomplex *, blasint *); doublecomplex result; zdotc_(&result, n, x, incx, y, incy); return result; @@ -43,7 +43,7 @@ doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; /** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) * @@ -51,12 +51,12 @@ static int c__1 = 1; * It serves as an unblocked kernel in the recursive algorithms. 
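 * (editorial note: the zdotu_fun/zdotc_fun wrappers earlier in this diff
 * exist because, under BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES, complex-valued
 * BLAS functions are built as subroutines that write the result through an
 * extra first argument; the wrappers call
 * zdotu_(&result, n, x, incx, y, incy) and re-expose a value return)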
* */ /* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int - *isgn, int *m, int *n, doublecomplex *a, int *lda, - doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, - double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) + *isgn, blasint *m, blasint *n, doublecomplex *a, blasint *lda, + doublecomplex *b, blasint *ldb, doublecomplex *c__, blasint *ldc, + double *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4; @@ -66,7 +66,7 @@ static int c__1 = 1; void d_cnjg(doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, l; + static blasint j, k, l; static doublecomplex a11; static double db; static doublecomplex x11; @@ -74,23 +74,23 @@ static int c__1 = 1; static doublecomplex vec; static double dum[1], eps, sgn, smin; static doublecomplex suml, sumr; - extern int lsame_(char *, char *, ftnlen, ftnlen); + extern blasint lsame_(char *, char *, ftnlen, ftnlen); /* Double Complex */ doublecomplex zdotc_(int *, - doublecomplex *, int *, doublecomplex *, int *), zdotu_( - int *, doublecomplex *, int *, - doublecomplex *, int *); - extern /* Subroutine */ int dlabad_(double *, double *); + doublecomplex *, blasint *, doublecomplex *, blasint *), zdotu_( + blasint *, doublecomplex *, blasint *, + doublecomplex *, blasint *); + extern /* Subroutine */ blasint dlabad_(double *, double *); extern double dlamch_(char *, ftnlen); static double scaloc; - extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); - extern double zlange_(char *, int *, int *, doublecomplex *, - int *, double *, ftnlen); + extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); + extern double zlange_(char *, blasint *, blasint *, doublecomplex *, + blasint *, double *, ftnlen); static double bignum; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(int *, double *, + doublecomplex *, blasint *); /* Double Complex */ doublecomplex zladiv_(doublecomplex *, doublecomplex *); - static int notrna, notrnb; + static blasint notrna, notrnb; static double smlnum; /* Parameter adjustments */ diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c index ac9fe7bd4..3f6606d84 100644 --- a/relapack/src/ztrtri.c +++ b/relapack/src/ztrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, - double *, const int *, int *); +static void RELAPACK_ztrtri_rec(const char *, const char *, const blasint *, + double *, const blasint *, blasint *); /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. 
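(editorial note) All the RELAPACK_*_rec kernels in this series share one
shape: at or below a CROSSOVER_* threshold they defer to the unblocked
LAPACK kernel, otherwise they split the matrix and recurse on the diagonal
blocks, updating the off-diagonal blocks with Level-3 BLAS in between. A
hedged skeleton of that pattern (names and the split rule are
illustrative, not the actual xREC_SPLIT macros):

#define CROSSOVER 24                /* illustrative threshold */

static void rec_sketch(int n, double *A, int ldA, int *info) {
    if (n <= CROSSOVER) {
        /* unblocked LAPACK kernel, e.g. LAPACK(ztrti2), goes here */
        return;
    }
    const int n1 = (n + 1) / 2;     /* stand-in for the xREC_SPLIT rule */
    const int n2 = n - n1;
    rec_sketch(n1, A, ldA, info);                  /* recurse on A_TL */
    /* Level-3 update (TRMM/TRSM/GEMM) of the off-diagonal blocks here */
    rec_sketch(n2, A + ldA * n1 + n1, ldA, info);  /* recurse on A_BR */
}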
@@ -11,16 +11,16 @@ static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html * */ void RELAPACK_ztrtri( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_ztrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTRTRI", &minfo, strlen("ZTRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_ztrtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_ztrtri( /** ztrtri's recursive compute kernel */ static void RELAPACK_ztrtri_rec( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_ztrtri_rec( const double MONE[] = { -1. }; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR From 0bd956fd21cb1af79ac0c3dfb963bbb1dd8ce384 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Apr 2019 22:49:04 +0200 Subject: [PATCH 537/935] Correct length of name string in xerbla call --- interface/trsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/trsm.c b/interface/trsm.c index f2da285de..715c83a1f 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -204,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, if (side < 0) info = 1; if (info != 0) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1); return; } From 2aad88d5b9ded514d65c257cea818165447e5b78 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Apr 2019 23:01:49 +0200 Subject: [PATCH 538/935] Avoid out-of-bounds accesses in LAPACK EIG tests see https://github.com/Reference-LAPACK/lapack/issues/333 --- lapack-netlib/TESTING/EIG/chet21.f | 3 ++- lapack-netlib/TESTING/EIG/chpt21.f | 2 +- lapack-netlib/TESTING/EIG/zhet21.f | 3 ++- lapack-netlib/TESTING/EIG/zhpt21.f | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/chet21.f b/lapack-netlib/TESTING/EIG/chet21.f index 8dbdb521e..5aff64904 100644 --- a/lapack-netlib/TESTING/EIG/chet21.f +++ b/lapack-netlib/TESTING/EIG/chet21.f @@ -304,7 +304,8 @@ 10 CONTINUE * IF( N.GT.1 .AND. 
KBAND.EQ.1 ) THEN - DO 20 J = 1, N - 1 +CMK DO 20 J = 1, N - 1 + DO 20 J = 2, N - 1 CALL CHER2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK, N ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/chpt21.f b/lapack-netlib/TESTING/EIG/chpt21.f index 4b9279470..e151a8bd8 100644 --- a/lapack-netlib/TESTING/EIG/chpt21.f +++ b/lapack-netlib/TESTING/EIG/chpt21.f @@ -323,7 +323,7 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN - DO 20 J = 1, N - 1 + DO 20 J = 2, N - 1 CALL CHPR2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/zhet21.f b/lapack-netlib/TESTING/EIG/zhet21.f index 32a09741e..f6cb2d70a 100644 --- a/lapack-netlib/TESTING/EIG/zhet21.f +++ b/lapack-netlib/TESTING/EIG/zhet21.f @@ -304,7 +304,8 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN - DO 20 J = 1, N - 1 +CMK DO 20 J = 1, N - 1 + DO 20 J = 2, N - 1 CALL ZHER2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK, N ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/zhpt21.f b/lapack-netlib/TESTING/EIG/zhpt21.f index f9268661a..ef9e4418d 100644 --- a/lapack-netlib/TESTING/EIG/zhpt21.f +++ b/lapack-netlib/TESTING/EIG/zhpt21.f @@ -323,7 +323,8 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN - DO 20 J = 1, N - 1 +CMK DO 20 J = 1, N - 1 + DO 20 J = 2, N - 1 CALL ZHPR2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK ) 20 CONTINUE From 11530b76f7b19fbb2d9089ab8166ab54bde8b423 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Apr 2019 09:58:56 +0200 Subject: [PATCH 539/935] Correct INFO=4 condition --- relapack/src/cgetrf.c | 2 +- relapack/src/dgetrf.c | 5 ++--- relapack/src/sgetrf.c | 7 +------ relapack/src/zgetrf.c | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index 9aab718a0..878c9ec15 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -22,7 +22,7 @@ void RELAPACK_cgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c index c4bce8fc5..be960fde9 100644 --- a/relapack/src/dgetrf.c +++ b/relapack/src/dgetrf.c @@ -15,16 +15,15 @@ void RELAPACK_dgetrf( double *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - // Check arguments *info = 0; if (*m < 0) *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; - if (*info) { + if (*info!=0) { const blasint minfo = -*info; LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF")); return; diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 9d0ff1039..0231cc166 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -1,5 +1,4 @@ #include "relapack.h" - static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); @@ -22,16 +21,14 @@ void RELAPACK_sgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } - const blasint sn = MIN(*m, *n); - RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); // Right remainder @@ -61,7 +58,6 @@ static void RELAPACK_sgetrf_rec( float *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - if (*n <= MAX(CROSSOVER_SGETRF, 1)) { // Unblocked LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); 
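(editorial note) The INFO condition corrected above is easy to motivate
with a concrete case. For a column-major m-by-n matrix, consecutive
columns are ldA elements apart, so the requirement is ldA >= max(1, m).
With m = 2, n = 5 and ldA = 2 the matrix is perfectly valid, yet the old
condition (*ldA < MAX(1, *n)) would reject it with INFO = -4; conversely,
for a tall matrix (m = 5, n = 2) it would accept ldA = 2, whose columns
overlap. A self-contained restatement of the corrected check:

#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int check_getrf_args(int m, int n, int ldA) {
    if (m < 0)            return -1;
    if (n < 0)            return -2;
    if (ldA < MAX(1, m))  return -4;   /* the old code tested n here */
    return 0;
}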
@@ -77,7 +73,6 @@ static void RELAPACK_sgetrf_rec( const blasint n1 = SREC_SPLIT(*n); const blasint n2 = *n - n1; const blasint m2 = *m - n1; - // A_L A_R float *const A_L = A; float *const A_R = A + *ldA * n1; diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index 121b03401..b0d14ffb1 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -22,7 +22,7 @@ void RELAPACK_zgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; From 2cd463eabdcecce01a379c7aaebbb0c48e21c27d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Apr 2019 10:02:28 +0200 Subject: [PATCH 540/935] Disable reallocation of work array in xSYTRF as it appears to cause memory management problems (seen in the LAPACK tests) --- relapack/config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/relapack/config.h b/relapack/config.h index 9113a712d..e4fab0a12 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -36,8 +36,8 @@ // allow malloc in xsygst for improved performance #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC // allow malloc in xsytrf if the passed work buffer is too small -#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC - +//#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC +#define XSYTRF_ALLOW_MALLOC 0 //////////////////////////////// // LAPACK routine replacement // From 1036299da06d4ebd60139529885804fa63400e10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 00:12:37 +0200 Subject: [PATCH 541/935] Disable repeated recursion on Ab_BR in ReLAPACK xGBTRF due to crashes in LAPACK tests --- relapack/src/cgbtrf.c | 4 +++- relapack/src/dgbtrf.c | 6 ++++-- relapack/src/sgbtrf.c | 20 +++++++++++++------- relapack/src/zgbtrf.c | 12 +++++++----- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index eddfdedf7..61332c6a6 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -221,7 +221,9 @@ static void RELAPACK_cgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + //RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index f4b443629..cdf06ad5b 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -1,5 +1,6 @@ #include "relapack.h" -#include "stdlib.h" +#include +#include static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, const blasint *, blasint *); @@ -218,7 +219,8 @@ static void RELAPACK_dgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +// RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index 3a4de4ece..3e3fdf455 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -27,7 +27,7 @@ void RELAPACK_sgbtrf( *info = -3; else if (*ku < 0) *info = -4; - else if (*ldAb < 2 * *kl + *ku + 1) + else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { const 
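/* (editorial note, not part of the patch: in all four precisions of xGBTRF
   the recursion on the trailing band block Ab_BR is replaced with a single
   call to the unblocked LAPACK kernel, e.g.
       LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
   trading recursive blocking on that block for protection against the
   suspected infinite recursion flagged in the sgbtrf comment below) */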
blasint minfo = -*info; @@ -55,15 +55,16 @@ void RELAPACK_sgbtrf( // Allocate work space const blasint n1 = SREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv ); + const blasint nWorkl = abs( (kv > n1) ? n1 : kv ); + const blasint mWorku = abs( (*kl > n1) ? n1 : *kl ); + const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl ); float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); float *Worku = malloc(mWorku * nWorku * sizeof(float)); LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + // Recursive kernel RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); @@ -81,6 +82,7 @@ static void RELAPACK_sgbtrf_rec( blasint *info ) { + if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { // Unblocked LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); @@ -127,7 +129,7 @@ static void RELAPACK_sgbtrf_rec( float *const A_BR = A + *ldA * n1 + m1; // ipiv_T - // ipiv_B + // ipiv_B blasint *const ipiv_T = ipiv; blasint *const ipiv_B = ipiv + n1; @@ -155,6 +157,7 @@ static void RELAPACK_sgbtrf_rec( float *const A_BRbl = A_BR + m21; float *const A_BRbr = A_BR + *ldA * n21 + m21; + // recursion(Ab_L, ipiv_T) RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); @@ -216,8 +219,11 @@ static void RELAPACK_sgbtrf_rec( } } + // recursion(Ab_BR, ipiv_B) - RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +//cause of infinite recursion here ? +// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index 0dd3fa7c3..d4ba41753 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -56,10 +56,10 @@ void RELAPACK_zgbtrf( // Allocate work space const blasint n1 = ZREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); + const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl); + const blasint nWorku = abs ( (*kl > n1) ? 
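/* (editorial note: the abs() wrappers added around all four work-space
   extents appear to be a defensive guard keeping the values non-negative
   before they are multiplied into the malloc size below) */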
MAX(0, *n - *kl) : *kl); double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -221,7 +221,9 @@ static void RELAPACK_zgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + // RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots From 0f105dd8a5a597b2f468f774a52da226581efbdc Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Sat, 13 Apr 2019 13:56:19 +0000 Subject: [PATCH 542/935] sgemm/strmm --- CONTRIBUTORS.md | 5 +- kernel/power/KERNEL.POWER9 | 6 +- kernel/power/sgemm_kernel_power9.S | 286 ++ kernel/power/sgemm_logic_power9.S | 2133 ++++++++++ kernel/power/sgemm_macros_power9.S | 5828 ++++++++++++++++++++++++++++ param.h | 4 +- 6 files changed, 8256 insertions(+), 6 deletions(-) create mode 100644 kernel/power/sgemm_kernel_power9.S create mode 100644 kernel/power/sgemm_logic_power9.S create mode 100644 kernel/power/sgemm_macros_power9.S diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 08f8cc69d..3859a9c19 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -167,4 +167,7 @@ In chronological order: * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 - + * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes + * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes + * [2019-03-14] power9 dgemm/dtrmm kernel + * [2019-04-29] power9 sgemm/strmm kernel diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 86a931971..6d5cf9068 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -3,16 +3,16 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = strmm_kernel_16x8_power8.S +STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S new file mode 100644 index 000000000..a44659468 --- /dev/null +++ b/kernel/power/sgemm_kernel_power9.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
From 0f105dd8a5a597b2f468f774a52da226581efbdc Mon Sep 17 00:00:00 2001
From: AbdelRauf
Date: Sat, 13 Apr 2019 13:56:19 +0000
Subject: [PATCH 542/935] sgemm/strmm

---
 CONTRIBUTORS.md                    |    5 +-
 kernel/power/KERNEL.POWER9         |    6 +-
 kernel/power/sgemm_kernel_power9.S |  286 ++
 kernel/power/sgemm_logic_power9.S  | 2133 ++++++++++
 kernel/power/sgemm_macros_power9.S | 5828 ++++++++++++++++++++++++++++
 param.h                            |    4 +-
 6 files changed, 8256 insertions(+), 6 deletions(-)
 create mode 100644 kernel/power/sgemm_kernel_power9.S
 create mode 100644 kernel/power/sgemm_logic_power9.S
 create mode 100644 kernel/power/sgemm_macros_power9.S

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 08f8cc69d..3859a9c19 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -167,4 +167,7 @@ In chronological order:
   * [2017-02-26] ztrmm kernel for IBM z13
   * [2017-03-13] strmm and ctrmm kernel for IBM z13
   * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
-
+  * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
+  * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
+  * [2019-03-14] power9 dgemm/dtrmm kernel
+  * [2019-04-29] power9 sgemm/strmm kernel
diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index 86a931971..6d5cf9068 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -3,16 +3,16 @@
 #CGEMM_BETA = ../generic/zgemm_beta.c
 #ZGEMM_BETA = ../generic/zgemm_beta.c
 
-STRMMKERNEL = strmm_kernel_16x8_power8.S
+STRMMKERNEL = sgemm_kernel_power9.S
 DTRMMKERNEL = dgemm_kernel_power9.S
 CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
 ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
 
-SGEMMKERNEL = sgemm_kernel_16x8_power8.S
+SGEMMKERNEL = sgemm_kernel_power9.S
 SGEMMINCOPY = ../generic/gemm_ncopy_16.c
 SGEMMITCOPY = sgemm_tcopy_16_power8.S
 SGEMMONCOPY = ../generic/gemm_ncopy_8.c
-SGEMMOTCOPY = sgemm_tcopy_8_power8.S 
+SGEMMOTCOPY = sgemm_tcopy_8_power8.S
 SGEMMINCOPYOBJ = sgemm_incopy.o
 SGEMMITCOPYOBJ = sgemm_itcopy.o
 SGEMMONCOPYOBJ = sgemm_oncopy.o
diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S
new file mode 100644
index 000000000..a44659468
--- /dev/null
+++ b/kernel/power/sgemm_kernel_power9.S
@@ -0,0 +1,286 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+
+#define LOAD ld
+#define STACKSIZE (512 )
+
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+
+
+
+#define alpha_r vs20
+#define save_permute_1 vs21
+#define save_permute_2 vs22
+#define permute_mask vs23
+#define o0 0
+
+
+#define T1 r11
+#define T2 r12
+#define T3 r14
+#define T4 r15
+#define T5 r16
+#define T6 r17
+#define L r18
+#define T7 r19
+#define T8 r20
+#define TEMP_REG r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T9 r27
+#define T10 r28
+#define T11 r29
+
+#define T12 r30
+#define T13 r31
+
+#include "sgemm_macros_power9.S"
+
+.equ perm_const1, 0x0405060700010203
+.equ perm_const2, 0x0c0d0e0f08090a0b
+.equ save_permute_11, 0x1415161718191a1b
+.equ save_permute_12, 0x0405060708090a0b
+.equ save_permute_21, 0x101112131c1d1e1f
+.equ save_permute_22, 0x000102030c0d0e0f
+
+
+#ifndef NEEDPARAM
+
+    PROLOGUE
+    PROFCODE
+
+    addi SP, SP, -STACKSIZE
+    li r0, 0
+
+    stfd f14, 0(SP)
+    stfd f15, 8(SP)
+    stfd f16, 16(SP)
+    stfd f17, 24(SP)
+
+    stfd f18, 32(SP)
+    stfd f19, 40(SP)
+    stfd f20, 48(SP)
+    stfd f21, 56(SP)
+
+    stfd f22, 64(SP)
+    stfd f23, 72(SP)
+    stfd f24, 80(SP)
+    stfd f25, 88(SP)
+
+    stfd f26, 96(SP)
+    stfd f27, 104(SP)
+    stfd f28, 112(SP)
+    stfd f29, 120(SP)
+
+    stfd f30, 128(SP)
+    stfd f31, 136(SP)
+
+
+    std r31, 144(SP)
+    std r30, 152(SP)
+    std r29, 160(SP)
+    std r28, 168(SP)
+    std r27, 176(SP)
+    std r26, 184(SP)
+    std r25, 192(SP)
+    std r24, 200(SP)
+    std r23, 208(SP)
+    std r22, 216(SP)
+    std r21, 224(SP)
+    std r20, 232(SP)
+    std r19, 240(SP)
+    std r18, 248(SP)
+    std r17, 256(SP)
+    std r16, 264(SP)
+    std r15, 272(SP)
+    std r14, 280(SP)
+
+
+    stxv v20, 288(SP)
+    stxv v21, 304(SP)
+    stxv v22, 320(SP)
+    stxv v23, 336(SP)
+    stxv v24, 352(SP)
+    stxv v25, 368(SP)
+    stxv v26, 384(SP)
+    stxv v27, 400(SP)
+    stxv v28, 416(SP)
+    stxv v29, 432(SP)
+    stxv v30, 448(SP)
+    stxv v31, 464(SP)
+
+
+
+#if defined(TRMMKERNEL)
+    ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+    slwi LDC, LDC, 2
+
+
+/* cmpwi cr0, M, 0
+ ble .L999_H1
+ cmpwi cr0, N, 0
+ ble .L999_H1
+ cmpwi cr0, K, 0
+ ble .L999_H1
+*/
+
+
+    /*alpha is stored in f1. convert to single and splat*/
+    xscvdpspn alpha_r,vs1
+    xxspltw alpha_r,alpha_r,0
+
+
+/*load reverse permute mask for big endian
+  uint128 = 0x0c0d0e0f08090a0b0405060700010203
+*/
+
+    lis T2, perm_const2@highest
+    ori T2, T2, perm_const2@higher
+    rldicr T2, T2, 32, 31
+    oris T2, T2, perm_const2@h
+    ori T2, T2, perm_const2@l
+
+    lis T1, perm_const1@highest
+    ori T1, T1, perm_const1@higher
+    rldicr T1, T1, 32, 31
+    oris T1, T1, perm_const1@h
+    ori T1, T1, perm_const1@l
+
+    mtvsrdd permute_mask,T2,T1
+
+    lis T2, save_permute_12@highest
+    ori T2, T2, save_permute_12@higher
+    rldicr T2, T2, 32, 31
+    oris T2, T2, save_permute_12@h
+    ori T2, T2, save_permute_12@l
+
+    lis T1, save_permute_11@highest
+    ori T1, T1, save_permute_11@higher
+    rldicr T1, T1, 32, 31
+    oris T1, T1, save_permute_11@h
+    ori T1, T1, save_permute_11@l
+
+    mtvsrdd save_permute_1,T2,T1
+
+    lis T2, save_permute_22@highest
+    ori T2, T2, save_permute_22@higher
+    rldicr T2, T2, 32, 31
+    oris T2, T2, save_permute_22@h
+    ori T2, T2, save_permute_22@l
+
+    lis T1, save_permute_21@highest
+    ori T1, T1, save_permute_21@higher
+    rldicr T1, T1, 32, 31
+    oris T1, T1, save_permute_21@h
+    ori T1, T1, save_permute_21@l
+
+    mtvsrdd save_permute_2,T2,T1
+
+#include "sgemm_logic_power9.S"
+
+.L999:
+    addi r3, 0, 0
+
+    lfd f14, 0(SP)
+    lfd f15, 8(SP)
+    lfd f16, 16(SP)
+    lfd f17, 24(SP)
+
+    lfd f18, 32(SP)
+    lfd f19, 40(SP)
+    lfd f20, 48(SP)
+    lfd f21, 56(SP)
+
+    lfd f22, 64(SP)
+    lfd f23, 72(SP)
+    lfd f24, 80(SP)
+    lfd f25, 88(SP)
+
+    lfd f26, 96(SP)
+    lfd f27, 104(SP)
+    lfd f28, 112(SP)
+    lfd f29, 120(SP)
+
+    lfd f30, 128(SP)
+    lfd f31, 136(SP)
+
+    ld r31, 144(SP)
+    ld r30, 152(SP)
+    ld r29, 160(SP)
+    ld r28, 168(SP)
+    ld r27, 176(SP)
+    ld r26, 184(SP)
+    ld r25, 192(SP)
+    ld r24, 200(SP)
+    ld r23, 208(SP)
+    ld r22, 216(SP)
+    ld r21, 224(SP)
+    ld r20, 232(SP)
+    ld r19, 240(SP)
+    ld r18, 248(SP)
+    ld r17, 256(SP)
+    ld r16, 264(SP)
+    ld r15, 272(SP)
+    ld r14, 280(SP)
+
+    lxv v20, 288(SP)
+    lxv v21, 304(SP)
+    lxv v22, 320(SP)
+    lxv v23, 336(SP)
+    lxv v24, 352(SP)
+    lxv v25, 368(SP)
+    lxv v26, 384(SP)
+    lxv v27, 400(SP)
+    lxv v28, 416(SP)
+    lxv v29, 432(SP)
+    lxv v30, 448(SP)
+    lxv v31, 464(SP)
+
+
+    addi SP, SP, STACKSIZE
+    blr
+
+    EPILOGUE
+#endif
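The logic file that follows drives the K loop the same way for every tile size: the trip count is taken with a shift (srawi. by 6, i.e. divide by 64, per the /**(K-1) % 64x */ comments) for a 64-way unrolled main loop, and the remainder is drained by testing one power-of-two bit at a time with andi. (32, 16, 8, 4, 2, 1). A toy C rendering of that unroll-and-tail idea; the real kernel also peels the first iteration and works from K-1, which this sketch omits:

#include <stdio.h>

/* toy stand-in for one K iteration of the inner kernel */
static void kernel_step(double *acc, const double *a, const double *b, int k) {
    *acc += a[k] * b[k];
}

static double dot_unrolled(const double *a, const double *b, int K) {
    double acc = 0.0;
    int k = 0;
    for (int blocks = K >> 6; blocks > 0; --blocks)  /* main loop: 64 at a time */
        for (int u = 0; u < 64; ++u)
            kernel_step(&acc, a, b, k++);
    for (int bit = 32; bit >= 1; bit >>= 1)          /* tail: one test per bit */
        if (K & bit)
            for (int u = 0; u < bit; ++u)
                kernel_step(&acc, a, b, k++);
    return acc;
}

int main(void) {
    double a[100], b[100];
    for (int i = 0; i < 100; ++i) { a[i] = i; b[i] = 1.0; }
    printf("%f\n", dot_unrolled(a, b, 100));         /* 4950.0 */
    return 0;
}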
diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S
new file mode 100644
index 000000000..300e30470
--- /dev/null
+++ b/kernel/power/sgemm_logic_power9.S
@@ -0,0 +1,2133 @@
+#define MY_ALIGN .align 3
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    neg TEMP_REG, OFFSET
+#endif
+
+    srawi. J, N, 3
+
+    ble LSGEMM_L8_END
+
+LSGEMM_L8_BEGIN:
+
+    li T1, 128
+    li T2, 256
+
+    mr AO, A
+    mr CO, C
+    slwi T3, LDC , 3
+    add C, C, T3
+
+    dcbt A, T1
+    dcbt A, T2
+#if defined(TRMMKERNEL) && defined(LEFT)
+    mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+    srawi. I, M, 4
+    ble LSGEMM_L8x16_END
+
+    MY_ALIGN
+LSGEMM_L8x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
+    mr T12, T11
+    addi T12,T12, -1
+    srawi. L, T12, 6 /**(T11-1) % 64x */
+#else
+    mr T12, K
+    addi T12,T12, -1
+    srawi. L, T12, 6 /**(K-1) % 64x */
+#endif
+
+    ZERO8x16
+    ble LSGEMM_L8x16_SUB0
+
+    MY_ALIGN
+LSGEMM_L8x16_LOOP_START:
+
+    LOAD8x16_0 /*we already zeroed */
+    ##OffsetA=64 OffsetB=32
+    addi AO,AO,2112
+    addi BO,BO,32
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L8x16_LOOP:
+
+    KERNEL8x16_I1_L4_2 -2048,0, 0,0
+    KERNEL8x16_I1_L4_2 -2048,0, 1,0
+    KERNEL8x16_I1_L4_2 -2048,0, 2,0
+    KERNEL8x16_I1_L4_2 -2048,0, 3,0
+    KERNEL8x16_I1_L4_2 -2048,0, 4,0
+    KERNEL8x16_I1_L4_2 -2048,0, 5,0
+    KERNEL8x16_I1_L4_2 -2048,0, 6,0
+    KERNEL8x16_I1_L4_2 -2048,0, 7,0
+    KERNEL8x16_I1_L4_2 -2048,0, 8,0
+    KERNEL8x16_I1_L4_2 -2048,0, 9,0
+    KERNEL8x16_I1_L4_2 -2048,0, 10,0
+    KERNEL8x16_I1_L4_2 -2048,0, 11,0
+    KERNEL8x16_I1_L4_2 -2048,0, 12,0
+    KERNEL8x16_I1_L4_2 -2048,0, 13,0
+    KERNEL8x16_I1_L4_2 -2048,0, 14,0
+    KERNEL8x16_I1_L4_2 -2048,0, 15,1
+
+    bdnz LSGEMM_L8x16_LOOP
+
+    MY_ALIGN
+LSGEMM_L8x16_LOOP_END:
+
+    END8x16 0, AO, BO, -2048, 0
+
+    b LSGEMM_L8x16_SUB1
+    MY_ALIGN
+LSGEMM_L8x16_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 127
+#else
+    andi. L, K, 127
+#endif
+    b LSGEMM_L8x16_SUB2
+    MY_ALIGN
+LSGEMM_L8x16_SUB1:
+#if defined(TRMMKERNEL)
+    andi. L, T12, 63
+#else
+    andi. L, T12, 63
+#endif
+    ble LSGEMM_L8x16_SAVE
+    MY_ALIGN
+LSGEMM_L8x16_SUB2:
+
+    srawi. T10,L, 5
+    ble LSGEMM_L8x16_SUB2_16
+    mtctr T10
+    MY_ALIGN
+LSGEMM_L8x16_SUB2_LOOP:
+    LOAD8x16_0
+    KERNEL8x16_I1_L4_2 64,32, 0,0
+    KERNEL8x16_I1_L4_2 64,32, 1,0
+    KERNEL8x16_I1_L4_2 64,32, 2,0
+    KERNEL8x16_I1_L4_2 64,32, 3,0
+    KERNEL8x16_I1_L4_2 64,32, 4,0
+    KERNEL8x16_I1_L4_2 64,32, 5,0
+    KERNEL8x16_I1_L4_2 64,32, 6,0
+    KERNEL8x16_I1_L4_3 64,32, 7,1
+    bdnz LSGEMM_L8x16_SUB2_LOOP
+    MY_ALIGN
+LSGEMM_L8x16_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_L8x16_SUB2_8
+    LOAD8x16_0
+    KERNEL8x16_I1_L4_2 64,32, 0,0
+    KERNEL8x16_I1_L4_2 64,32, 1,0
+    KERNEL8x16_I1_L4_2 64,32, 2,0
+    KERNEL8x16_I1_L4_3 64,32, 3,1
+    MY_ALIGN
+LSGEMM_L8x16_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_L8x16_SUB2_4
+    LOAD8x16_0
+    KERNEL8x16_I1_L4_2 64,32, 0,0
+    KERNEL8x16_I1_L4_3 64,32, 1,1
+    MY_ALIGN
+LSGEMM_L8x16_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_L8x16_SUB2_2
+    LOAD8x16_0
+    KERNEL8x16_I1_L4_3 64,32, 0,1
+    MY_ALIGN
+LSGEMM_L8x16_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_L8x16_SUB2_1
+    LOAD8x16_0
+    KERNEL8x16_I1_L2_3 64,32, 0,1
+    MY_ALIGN
+LSGEMM_L8x16_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_L8x16_SAVE
+    KERNEL8x16 0
+#    addic. L, L, -1
+#    bgt LSGEMM_L8x16_SUB2
+
+    MY_ALIGN
+LSGEMM_L8x16_SAVE:
+    SAVE8x16
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8
+#endif
+    addic. I, I, -1
+    bgt+ LSGEMM_L8x16_BEGIN
+    MY_ALIGN
+LSGEMM_L8x16_END:
+LSGEMM_L8x8_BEGIN:
+    andi. T2, M, 15
+    ble LSGEMM_L8x1_END
+
+    andi. T1, M, 8
+    ble LSGEMM_L8x8_END
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,8,8
+    mr T12, T11
+    addi T12,T12, -1
+    srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+    mr T12, K
+    addi T12,T12, -1
+    srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+    ZERO8x8
+    ble LSGEMM_L8x8_SUB0
+
+    MY_ALIGN
+LSGEMM_L8x8_LOOP_START:
+
+    LOAD8x8_0 /*we already zeroed */
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L8x8_LOOP:
+
+    KERNEL8x8_I1_L4_2 32,32, 0,0
+    KERNEL8x8_I1_L4_2 32,32, 1,0
+    KERNEL8x8_I1_L4_2 32,32, 2,0
+    KERNEL8x8_I1_L4_2 32,32, 3,1
+
+    bdnz LSGEMM_L8x8_LOOP
+
+    MY_ALIGN
+LSGEMM_L8x8_LOOP_END:
+
+    END8x8 0, AO, BO, 32, 32
+
+    b LSGEMM_L8x8_SUB1
+    MY_ALIGN
+LSGEMM_L8x8_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 31
+#else
+    andi. L, K, 31
+#endif
+    b LSGEMM_L8x8_SUB2
+    MY_ALIGN
+LSGEMM_L8x8_SUB1:
+#if defined(TRMMKERNEL)
+    andi. L, T12, 15
+#else
+    andi. L, T12, 15
+#endif
+    ble LSGEMM_L8x8_SAVE
+    MY_ALIGN
+LSGEMM_L8x8_SUB2:
+
+    srawi. T1,L, 3
+    ble LSGEMM_L8x8_SUB2_4
+    mtctr T1
+    MY_ALIGN
+LSGEMM_L8x8_SUB2_LOOP:
+    LOAD8x8_0
+    KERNEL8x8_I1_L4_2 32,32, 0,0
+    KERNEL8x8_I1_L4_3 32,32, 1,1
+    bdnz LSGEMM_L8x8_SUB2_LOOP
+    MY_ALIGN
+LSGEMM_L8x8_SUB2_4:
+    andi. T1,L, 4
+    ble LSGEMM_L8x8_SUB2_2
+    LOAD8x8_0
+    KERNEL8x8_I1_L4_3 32,32, 0,1
+    MY_ALIGN
+LSGEMM_L8x8_SUB2_2:
+    andi. T1,L, 2
+    ble LSGEMM_L8x8_SUB2_1
+    LOAD8x8_0
+    KERNEL8x8_I1_L2_3 32,32, 0,1
+    MY_ALIGN
+LSGEMM_L8x8_SUB2_1:
+    andi. T1,L, 1
+    ble LSGEMM_L8x8_SAVE
+    KERNEL8x8 0
+
+
+    MY_ALIGN
+LSGEMM_L8x8_SAVE:
+    SAVE8x8
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8
+#endif
+    MY_ALIGN
+LSGEMM_L8x8_END:
+LSGEMM_L8x4_BEGIN:
+    andi. T2, M, 15
+    ble LSGEMM_L8x1_END
+
+    andi. T1, M, 4
+    ble LSGEMM_L8x4_END
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,4,8
+    mr T12, T11
+    addi T12,T12, -1
+    srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+    mr T12, K
+    addi T12,T12, -1
+    srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+    ZERO8x4
+    ble LSGEMM_L8x4_SUB0
+
+    MY_ALIGN
+LSGEMM_L8x4_LOOP_START:
+
+    LOAD8x4_0 /*we already zeroed */
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L8x4_LOOP:
+
+    KERNEL8x4_I1_L4_2 16,32, 0,0
+    KERNEL8x4_I1_L4_2 16,32, 1,0
+    KERNEL8x4_I1_L4_2 16,32, 2,0
+    KERNEL8x4_I1_L4_2 16,32, 3,1
+
+    bdnz LSGEMM_L8x4_LOOP
+
+    MY_ALIGN
+LSGEMM_L8x4_LOOP_END:
+
+    END8x4 0, AO, BO, 16, 32
+
+    b LSGEMM_L8x4_SUB1
+    MY_ALIGN
+LSGEMM_L8x4_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 31
+#else
+    andi. L, K, 31
+#endif
+    b LSGEMM_L8x4_SUB2
+    MY_ALIGN
+LSGEMM_L8x4_SUB1:
+#if defined(TRMMKERNEL)
+    andi. L, T12, 15
+#else
+    andi. L, T12, 15
+#endif
+    ble LSGEMM_L8x4_SAVE
+    MY_ALIGN
+LSGEMM_L8x4_SUB2:
+
+    srawi. T1,L, 3
+    ble LSGEMM_L8x4_SUB2_4
+    mtctr T1
+    MY_ALIGN
+LSGEMM_L8x4_SUB2_LOOP:
+    LOAD8x4_0
+    KERNEL8x4_I1_L4_2 16,32, 0,0
+    KERNEL8x4_I1_L4_3 16,32, 1,1
+    bdnz LSGEMM_L8x4_SUB2_LOOP
+    MY_ALIGN
+LSGEMM_L8x4_SUB2_4:
+    andi. T1,L, 4
+    ble LSGEMM_L8x4_SUB2_2
+    LOAD8x4_0
+    KERNEL8x4_I1_L4_3 16,32, 0,1
+    MY_ALIGN
+LSGEMM_L8x4_SUB2_2:
+    andi. T1,L, 2
+    ble LSGEMM_L8x4_SUB2_1
+    LOAD8x4_0
+    KERNEL8x4_I1_L2_3 16,32, 0,1
+    MY_ALIGN
+LSGEMM_L8x4_SUB2_1:
+    andi. T1,L, 1
+    ble LSGEMM_L8x4_SAVE
+    KERNEL8x4 0
+
+
+    MY_ALIGN
+LSGEMM_L8x4_SAVE:
+    SAVE8x4
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8
+#endif
+    MY_ALIGN
+LSGEMM_L8x4_END:
+LSGEMM_L8x2_BEGIN:
+    andi. T1, M, 2
+    ble LSGEMM_L8x2_END
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,2,8
+    srawi. L, T11, 3 /**(T11) % 8x */
+#else
+    srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+    ZERO8x2
+    ble LSGEMM_L8x2_SUB0
+
+    MY_ALIGN
+LSGEMM_L8x2_LOOP_START:
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L8x2_LOOP:
+
+    KERNEL8x2_2 0,0, 0,0
+    KERNEL8x2_2 0,0, 1,0
+    KERNEL8x2_2 0,0, 2,0
+    KERNEL8x2_2 0,0, 3,1
+
+    bdnz LSGEMM_L8x2_LOOP
+
+    MY_ALIGN
+LSGEMM_L8x2_LOOP_END:
+
+LSGEMM_L8x2_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 7
+#else
+    andi. L, K, 7
+#endif
+    ble LSGEMM_L8x2_SAVE
+    MY_ALIGN
+LSGEMM_L8x2_SUB2:
+    andi. T1,L, 4
+    ble LSGEMM_L8x2_SUB2_2
+    KERNEL8x2_2 0,0, 0,0
+    KERNEL8x2_2 0,0, 1,1
+    MY_ALIGN
+LSGEMM_L8x2_SUB2_2:
+    andi. T1,L, 2
+    ble LSGEMM_L8x2_SUB2_1
+    KERNEL8x2_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L8x2_SUB2_1:
+    andi. T1,L, 1
+    ble LSGEMM_L8x2_SAVE
+    KERNEL8x2
+
+    MY_ALIGN
+LSGEMM_L8x2_SAVE:
+    SAVE8x2
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8
+#endif
+    MY_ALIGN
+LSGEMM_L8x2_END:
+LSGEMM_L8x1_BEGIN:
+    andi. T1, M, 1
+    ble LSGEMM_L8x1_END
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,1,8
+    srawi. L, T11, 3 /**(T11) % 8x */
+#else
+    srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+    ZERO8x1
+    ble LSGEMM_L8x1_SUB0
+
+    MY_ALIGN
+LSGEMM_L8x1_LOOP_START:
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L8x1_LOOP:
+
+    KERNEL8x1_4 0,0, 0,0
+    KERNEL8x1_4 0,0, 1,1
+
+    bdnz LSGEMM_L8x1_LOOP
+
+    MY_ALIGN
+LSGEMM_L8x1_LOOP_END:
+
+LSGEMM_L8x1_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 7
+#else
+    andi. L, K, 7
+#endif
+    ble LSGEMM_L8x1_SAVE
+    MY_ALIGN
+LSGEMM_L8x1_SUB2:
+    andi. T1,L, 4
+    ble LSGEMM_L8x1_SUB2_2
+    KERNEL8x1_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L8x1_SUB2_2:
+    andi. T1,L, 2
+    ble LSGEMM_L8x1_SUB2_1
+    KERNEL8x1_2
+    MY_ALIGN
+LSGEMM_L8x1_SUB2_1:
+    andi. T1,L, 1
+    ble LSGEMM_L8x1_SAVE
+    KERNEL8x1
+
+    MY_ALIGN
+LSGEMM_L8x1_SAVE:
+    SAVE8x1
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8
+#endif
+    MY_ALIGN
+LSGEMM_L8x1_END:
+
+    slwi T1, K, 5
+    add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi TEMP_REG, TEMP_REG, 8
+#endif
+    addic. J, J, -1
+    bgt LSGEMM_L8_BEGIN
+
+
+LSGEMM_L8_END:
+
+/* b LSGEMM_L4_BEGIN*/
+    andi. T1, N, 4
+    ble LSGEMM_L4_END
+LSGEMM_L4_BEGIN:
+
+
+    mr AO, A
+    mr CO, C
+    slwi T3, LDC , 2
+    add C, C, T3
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+    srawi. I, M, 4
+    ble LSGEMM_L4x16_END
+
+    MY_ALIGN
+LSGEMM_L4x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,16,4
+    mr T12, T11
+    addi T12,T12, -1
+    srawi. L, T12, 6 /**(T11-1) % 64x */
+#else
+    mr T12, K
+    addi T12,T12, -1
+    srawi. L, T12, 6 /**(K-1) % 64x */
+#endif
+
+    ZERO4x16
+    ble LSGEMM_L4x16_SUB0
+
+    MY_ALIGN
+LSGEMM_L4x16_LOOP_START:
+
+    LOAD4x16_0 /*we already zeroed */
+    ##OffsetA=64 OffsetB=16
+    addi AO,AO,2112
+    addi BO,BO,16
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L4x16_LOOP:
+
+    KERNEL4x16_I1_L4_2 -2048,0, 0,0
+    KERNEL4x16_I1_L4_2 -2048,0, 1,0
+    KERNEL4x16_I1_L4_2 -2048,0, 2,0
+    KERNEL4x16_I1_L4_2 -2048,0, 3,0
+    KERNEL4x16_I1_L4_2 -2048,0, 4,0
+    KERNEL4x16_I1_L4_2 -2048,0, 5,0
+    KERNEL4x16_I1_L4_2 -2048,0, 6,0
+    KERNEL4x16_I1_L4_2 -2048,0, 7,0
+    KERNEL4x16_I1_L4_2 -2048,0, 8,0
+    KERNEL4x16_I1_L4_2 -2048,0, 9,0
+    KERNEL4x16_I1_L4_2 -2048,0, 10,0
+    KERNEL4x16_I1_L4_2 -2048,0, 11,0
+    KERNEL4x16_I1_L4_2 -2048,0, 12,0
+    KERNEL4x16_I1_L4_2 -2048,0, 13,0
+    KERNEL4x16_I1_L4_2 -2048,0, 14,0
+    KERNEL4x16_I1_L4_2 -2048,0, 15,1
+
+    bdnz LSGEMM_L4x16_LOOP
+
+    MY_ALIGN
+LSGEMM_L4x16_LOOP_END:
+
+    END4x16 0, AO, BO, -2048, 0
+
+    b LSGEMM_L4x16_SUB1
+    MY_ALIGN
+LSGEMM_L4x16_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 127
+#else
+    andi. L, K, 127
+#endif
+    b LSGEMM_L4x16_SUB2
+    MY_ALIGN
+LSGEMM_L4x16_SUB1:
+#if defined(TRMMKERNEL)
+    andi. L, T12, 63
+#else
+    andi. L, T12, 63
+#endif
+    ble LSGEMM_L4x16_SAVE
+    MY_ALIGN
+LSGEMM_L4x16_SUB2:
+
+    srawi. T10,L, 5
+    ble LSGEMM_L4x16_SUB2_16
+    mtctr T10
+    MY_ALIGN
+LSGEMM_L4x16_SUB2_LOOP:
+    LOAD4x16_0
+    KERNEL4x16_I1_L4_2 64,16, 0,0
+    KERNEL4x16_I1_L4_2 64,16, 1,0
+    KERNEL4x16_I1_L4_2 64,16, 2,0
+    KERNEL4x16_I1_L4_2 64,16, 3,0
+    KERNEL4x16_I1_L4_2 64,16, 4,0
+    KERNEL4x16_I1_L4_2 64,16, 5,0
+    KERNEL4x16_I1_L4_2 64,16, 6,0
+    KERNEL4x16_I1_L4_3 64,16, 7,1
+    bdnz LSGEMM_L4x16_SUB2_LOOP
+    MY_ALIGN
+LSGEMM_L4x16_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_L4x16_SUB2_8
+    LOAD4x16_0
+    KERNEL4x16_I1_L4_2 64,16, 0,0
+    KERNEL4x16_I1_L4_2 64,16, 1,0
+    KERNEL4x16_I1_L4_2 64,16, 2,0
+    KERNEL4x16_I1_L4_3 64,16, 3,1
+    MY_ALIGN
+LSGEMM_L4x16_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_L4x16_SUB2_4
+    LOAD4x16_0
+    KERNEL4x16_I1_L4_2 64,16, 0,0
+    KERNEL4x16_I1_L4_3 64,16, 1,1
+    MY_ALIGN
+LSGEMM_L4x16_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_L4x16_SUB2_2
+    LOAD4x16_0
+    KERNEL4x16_I1_L4_3 64,16, 0,1
+    MY_ALIGN
+LSGEMM_L4x16_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_L4x16_SUB2_1
+    LOAD4x16_0
+    KERNEL4x16_I1_L2_3 64,16, 0,1
+    MY_ALIGN
+LSGEMM_L4x16_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_L4x16_SAVE
+    KERNEL4x16 0
+#    addic. L, L, -1
+#    bgt LSGEMM_L4x16_SUB2
+
+    MY_ALIGN
+LSGEMM_L4x16_SAVE:
+    SAVE4x16
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4
+#endif
+    addic. I, I, -1
+    bgt+ LSGEMM_L4x16_BEGIN
+    MY_ALIGN
+LSGEMM_L4x16_END:
+LSGEMM_L4x8_BEGIN:
+    andi. T2, M, 15
+    ble LSGEMM_L4x1_END
+
+    andi. T1, M, 8
+    ble LSGEMM_L4x8_END
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,8,4
+    mr T12, T11
+    addi T12,T12, -1
+    srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+    mr T12, K
+    addi T12,T12, -1
+    srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+    ZERO4x8
+    ble LSGEMM_L4x8_SUB0
+
+    MY_ALIGN
+LSGEMM_L4x8_LOOP_START:
+
+    LOAD4x8_0 /*we already zeroed */
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L4x8_LOOP:
+
+    KERNEL4x8_I1_L4_2 32,16, 0,0
+    KERNEL4x8_I1_L4_2 32,16, 1,0
+    KERNEL4x8_I1_L4_2 32,16, 2,0
+    KERNEL4x8_I1_L4_2 32,16, 3,1
+
+    bdnz LSGEMM_L4x8_LOOP
+
+    MY_ALIGN
+LSGEMM_L4x8_LOOP_END:
+
+    END4x8 0, AO, BO, 32, 16
+
+    b LSGEMM_L4x8_SUB1
+    MY_ALIGN
+LSGEMM_L4x8_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 31
+#else
+    andi. L, K, 31
+#endif
+    b LSGEMM_L4x8_SUB2
+    MY_ALIGN
+LSGEMM_L4x8_SUB1:
+#if defined(TRMMKERNEL)
+    andi. L, T12, 15
+#else
+    andi. L, T12, 15
+#endif
+    ble LSGEMM_L4x8_SAVE
+    MY_ALIGN
+LSGEMM_L4x8_SUB2:
+
+    srawi. T1,L, 3
+    ble LSGEMM_L4x8_SUB2_4
+    mtctr T1
+    MY_ALIGN
+LSGEMM_L4x8_SUB2_LOOP:
+    LOAD4x8_0
+    KERNEL4x8_I1_L4_2 32,16, 0,0
+    KERNEL4x8_I1_L4_3 32,16, 1,1
+    bdnz LSGEMM_L4x8_SUB2_LOOP
+    MY_ALIGN
+LSGEMM_L4x8_SUB2_4:
+    andi. T1,L, 4
+    ble LSGEMM_L4x8_SUB2_2
+    LOAD4x8_0
+    KERNEL4x8_I1_L4_3 32,16, 0,1
+    MY_ALIGN
+LSGEMM_L4x8_SUB2_2:
+    andi. T1,L, 2
+    ble LSGEMM_L4x8_SUB2_1
+    LOAD4x8_0
+    KERNEL4x8_I1_L2_3 32,16, 0,1
+    MY_ALIGN
+LSGEMM_L4x8_SUB2_1:
+    andi. T1,L, 1
+    ble LSGEMM_L4x8_SAVE
+    KERNEL4x8 0
+
+
+    MY_ALIGN
+LSGEMM_L4x8_SAVE:
+    SAVE4x8
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4
+#endif
+    MY_ALIGN
+LSGEMM_L4x8_END:
+LSGEMM_L4x4_BEGIN:
+    andi. T2, M, 15
+    ble LSGEMM_L4x1_END
+
+    andi. T1, M, 4
+    ble LSGEMM_L4x4_END
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,4,4
+    mr T12, T11
+    addi T12,T12, -1
+    srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+    mr T12, K
+    addi T12,T12, -1
+    srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+    ZERO4x4
+    ble LSGEMM_L4x4_SUB0
+
+    MY_ALIGN
+LSGEMM_L4x4_LOOP_START:
+
+    LOAD4x4_0 /*we already zeroed */
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L4x4_LOOP:
+
+    KERNEL4x4_I1_L4_2 16,16, 0,0
+    KERNEL4x4_I1_L4_2 16,16, 1,0
+    KERNEL4x4_I1_L4_2 16,16, 2,0
+    KERNEL4x4_I1_L4_2 16,16, 3,1
+
+    bdnz LSGEMM_L4x4_LOOP
+
+    MY_ALIGN
+LSGEMM_L4x4_LOOP_END:
+
+    END4x4 0, AO, BO, 16, 16
+
+    b LSGEMM_L4x4_SUB1
+    MY_ALIGN
+LSGEMM_L4x4_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 31
+#else
+    andi. L, K, 31
+#endif
+    b LSGEMM_L4x4_SUB2
+    MY_ALIGN
+LSGEMM_L4x4_SUB1:
+#if defined(TRMMKERNEL)
+    andi. L, T12, 15
+#else
+    andi. L, T12, 15
+#endif
+    ble LSGEMM_L4x4_SAVE
+    MY_ALIGN
+LSGEMM_L4x4_SUB2:
+
+    srawi. T1,L, 3
+    ble LSGEMM_L4x4_SUB2_4
+    mtctr T1
+    MY_ALIGN
+LSGEMM_L4x4_SUB2_LOOP:
+    LOAD4x4_0
+    KERNEL4x4_I1_L4_2 16,16, 0,0
+    KERNEL4x4_I1_L4_3 16,16, 1,1
+    bdnz LSGEMM_L4x4_SUB2_LOOP
+    MY_ALIGN
+LSGEMM_L4x4_SUB2_4:
+    andi. T1,L, 4
+    ble LSGEMM_L4x4_SUB2_2
+    LOAD4x4_0
+    KERNEL4x4_I1_L4_3 16,16, 0,1
+    MY_ALIGN
+LSGEMM_L4x4_SUB2_2:
+    andi. T1,L, 2
+    ble LSGEMM_L4x4_SUB2_1
+    LOAD4x4_0
+    KERNEL4x4_I1_L2_3 16,16, 0,1
+    MY_ALIGN
+LSGEMM_L4x4_SUB2_1:
+    andi. T1,L, 1
+    ble LSGEMM_L4x4_SAVE
+    KERNEL4x4 0
+
+
+    MY_ALIGN
+LSGEMM_L4x4_SAVE:
+    SAVE4x4
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4
+#endif
+    MY_ALIGN
+LSGEMM_L4x4_END:
+LSGEMM_L4x2_BEGIN:
+    andi. T1, M, 2
+    ble LSGEMM_L4x2_END
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,2,4
+    srawi. L, T11, 3 /**(T11) % 8x */
+#else
+    srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+    ZERO4x2
+    ble LSGEMM_L4x2_SUB0
+
+    MY_ALIGN
+LSGEMM_L4x2_LOOP_START:
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L4x2_LOOP:
+
+    KERNEL4x2_2 0,0, 0,0
+    KERNEL4x2_2 0,0, 1,0
+    KERNEL4x2_2 0,0, 2,0
+    KERNEL4x2_2 0,0, 3,1
+
+    bdnz LSGEMM_L4x2_LOOP
+
+    MY_ALIGN
+LSGEMM_L4x2_LOOP_END:
+
+LSGEMM_L4x2_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 7
+#else
+    andi. L, K, 7
+#endif
+    ble LSGEMM_L4x2_SAVE
+    MY_ALIGN
+LSGEMM_L4x2_SUB2:
+    andi. T1,L, 4
+    ble LSGEMM_L4x2_SUB2_2
+    KERNEL4x2_2 0,0, 0,0
+    KERNEL4x2_2 0,0, 1,1
+    MY_ALIGN
+LSGEMM_L4x2_SUB2_2:
+    andi. T1,L, 2
+    ble LSGEMM_L4x2_SUB2_1
+    KERNEL4x2_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L4x2_SUB2_1:
+    andi. T1,L, 1
+    ble LSGEMM_L4x2_SAVE
+    KERNEL4x2
+
+    MY_ALIGN
+LSGEMM_L4x2_SAVE:
+    SAVE4x2
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4
+#endif
+    MY_ALIGN
+LSGEMM_L4x2_END:
+LSGEMM_L4x1_BEGIN:
+    andi. T1, M, 1
+    ble LSGEMM_L4x1_END
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,1,4
+    srawi. L, T11, 3 /**(T11) % 8x */
+#else
+    srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+    ZERO4x1
+    ble LSGEMM_L4x1_SUB0
+
+    MY_ALIGN
+LSGEMM_L4x1_LOOP_START:
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L4x1_LOOP:
+
+    KERNEL4x1_4 0,0, 0,0
+    KERNEL4x1_4 0,0, 1,1
+
+    bdnz LSGEMM_L4x1_LOOP
+
+    MY_ALIGN
+LSGEMM_L4x1_LOOP_END:
+
+LSGEMM_L4x1_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 7
+#else
+    andi. L, K, 7
+#endif
+    ble LSGEMM_L4x1_SAVE
+    MY_ALIGN
+LSGEMM_L4x1_SUB2:
+    andi. T1,L, 4
+    ble LSGEMM_L4x1_SUB2_2
+    KERNEL4x1_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L4x1_SUB2_2:
+    andi. T1,L, 2
+    ble LSGEMM_L4x1_SUB2_1
+    KERNEL4x1_2
+    MY_ALIGN
+LSGEMM_L4x1_SUB2_1:
+    andi. T1,L, 1
+    ble LSGEMM_L4x1_SAVE
+    KERNEL4x1
+
+    MY_ALIGN
+LSGEMM_L4x1_SAVE:
+    SAVE4x1
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4
+#endif
+    MY_ALIGN
+LSGEMM_L4x1_END:
+
+    slwi T1, K, 4
+    add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi TEMP_REG, TEMP_REG, 4
+#endif
+
+    andi. T2, N, 3
+    ble .L999
+
+LSGEMM_L4_END:
+    andi. T1, N, 2
+    ble LSGEMM_L2_END
+LSGEMM_L2_BEGIN:
+
+
+    mr AO, A
+    mr CO, C
+    slwi T3, LDC , 1
+    add C, C, T3
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+    srawi. I, M, 4
+    ble LSGEMM_L2x16_END
+
+    MY_ALIGN
+LSGEMM_L2x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,16,2
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO2x16
+    ble LSGEMM_L2x16_SUB0
+    addi AO,AO,2048
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L2x16_LOOP:
+
+    KERNEL2x16_4 -2048,0, 0,0
+    KERNEL2x16_4 -2048,0, 1,0
+    KERNEL2x16_4 -2048,0, 2,0
+    KERNEL2x16_4 -2048,0, 3,0
+    KERNEL2x16_4 -2048,0, 4,0
+    KERNEL2x16_4 -2048,0, 5,0
+    KERNEL2x16_4 -2048,0, 6,0
+    KERNEL2x16_4 -2048,0, 7,0
+    KERNEL2x16_4 -2048,0, 8,0
+    KERNEL2x16_4 -2048,0, 9,0
+    KERNEL2x16_4 -2048,0, 10,0
+    KERNEL2x16_4 -2048,0, 11,0
+    KERNEL2x16_4 -2048,0, 12,0
+    KERNEL2x16_4 -2048,0, 13,0
+    KERNEL2x16_4 -2048,0, 14,0
+    KERNEL2x16_4 -2048,0, 15,1
+
+    bdnz LSGEMM_L2x16_LOOP
+    MY_ALIGN
+    addi AO,AO, -2048
+    MY_ALIGN
+LSGEMM_L2x16_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_L2x16_SAVE
+    MY_ALIGN
+LSGEMM_L2x16_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_L2x16_SUB2_16
+    KERNEL2x16_4 0,0, 0,0
+    KERNEL2x16_4 0,0, 1,0
+    KERNEL2x16_4 0,0, 2,0
+    KERNEL2x16_4 0,0, 3,0
+    KERNEL2x16_4 0,0, 4,0
+    KERNEL2x16_4 0,0, 5,0
+    KERNEL2x16_4 0,0, 6,0
+    KERNEL2x16_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_L2x16_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_L2x16_SUB2_8
+    KERNEL2x16_4 0,0, 0,0
+    KERNEL2x16_4 0,0, 1,0
+    KERNEL2x16_4 0,0, 2,0
+    KERNEL2x16_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_L2x16_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_L2x16_SUB2_4
+    KERNEL2x16_4 0,0, 0,0
+    KERNEL2x16_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_L2x16_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_L2x16_SUB2_2
+    KERNEL2x16_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x16_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_L2x16_SUB2_1
+    KERNEL2x16_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x16_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_L2x16_SAVE
+    KERNEL2x16
+
+    MY_ALIGN
+LSGEMM_L2x16_SAVE:
+    SAVE2x16
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2
+#endif
+    addic. I, I, -1
+    bgt+ LSGEMM_L2x16_BEGIN
+    MY_ALIGN
+LSGEMM_L2x16_END:
+    andi. I, M, 8
+    ble LSGEMM_L2x8_END
+
+    MY_ALIGN
+LSGEMM_L2x8_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,8,2
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO2x8
+    ble LSGEMM_L2x8_SUB0
+    addi AO,AO,2048
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L2x8_LOOP:
+
+    KERNEL2x8_4 -2048,0, 0,0
+    KERNEL2x8_4 -2048,0, 1,0
+    KERNEL2x8_4 -2048,0, 2,0
+    KERNEL2x8_4 -2048,0, 3,0
+    KERNEL2x8_4 -2048,0, 4,0
+    KERNEL2x8_4 -2048,0, 5,0
+    KERNEL2x8_4 -2048,0, 6,0
+    KERNEL2x8_4 -2048,0, 7,0
+    KERNEL2x8_4 -2048,0, 8,0
+    KERNEL2x8_4 -2048,0, 9,0
+    KERNEL2x8_4 -2048,0, 10,0
+    KERNEL2x8_4 -2048,0, 11,0
+    KERNEL2x8_4 -2048,0, 12,0
+    KERNEL2x8_4 -2048,0, 13,0
+    KERNEL2x8_4 -2048,0, 14,0
+    KERNEL2x8_4 -2048,0, 15,1
+
+    bdnz LSGEMM_L2x8_LOOP
+    MY_ALIGN
+    addi AO,AO, -2048
+    MY_ALIGN
+LSGEMM_L2x8_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_L2x8_SAVE
+    MY_ALIGN
+LSGEMM_L2x8_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_L2x8_SUB2_16
+    KERNEL2x8_4 0,0, 0,0
+    KERNEL2x8_4 0,0, 1,0
+    KERNEL2x8_4 0,0, 2,0
+    KERNEL2x8_4 0,0, 3,0
+    KERNEL2x8_4 0,0, 4,0
+    KERNEL2x8_4 0,0, 5,0
+    KERNEL2x8_4 0,0, 6,0
+    KERNEL2x8_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_L2x8_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_L2x8_SUB2_8
+    KERNEL2x8_4 0,0, 0,0
+    KERNEL2x8_4 0,0, 1,0
+    KERNEL2x8_4 0,0, 2,0
+    KERNEL2x8_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_L2x8_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_L2x8_SUB2_4
+    KERNEL2x8_4 0,0, 0,0
+    KERNEL2x8_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_L2x8_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_L2x8_SUB2_2
+    KERNEL2x8_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x8_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_L2x8_SUB2_1
+    KERNEL2x8_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x8_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_L2x8_SAVE
+    KERNEL2x8
+
+    MY_ALIGN
+LSGEMM_L2x8_SAVE:
+    SAVE2x8
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2
+#endif
+    MY_ALIGN
+LSGEMM_L2x8_END:
+    andi. I, M, 4
+    ble LSGEMM_L2x4_END
+
+    MY_ALIGN
+LSGEMM_L2x4_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,4,2
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO2x4
+    ble LSGEMM_L2x4_SUB0
+
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L2x4_LOOP:
+
+    KERNEL2x4_4 0,0, 0,0
+    KERNEL2x4_4 0,0, 1,0
+    KERNEL2x4_4 0,0, 2,0
+    KERNEL2x4_4 0,0, 3,0
+    KERNEL2x4_4 0,0, 4,0
+    KERNEL2x4_4 0,0, 5,0
+    KERNEL2x4_4 0,0, 6,0
+    KERNEL2x4_4 0,0, 7,0
+    KERNEL2x4_4 0,0, 8,0
+    KERNEL2x4_4 0,0, 9,0
+    KERNEL2x4_4 0,0, 10,0
+    KERNEL2x4_4 0,0, 11,0
+    KERNEL2x4_4 0,0, 12,0
+    KERNEL2x4_4 0,0, 13,0
+    KERNEL2x4_4 0,0, 14,0
+    KERNEL2x4_4 0,0, 15,1
+
+    bdnz LSGEMM_L2x4_LOOP
+    MY_ALIGN
+
+    MY_ALIGN
+LSGEMM_L2x4_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_L2x4_SAVE
+    MY_ALIGN
+LSGEMM_L2x4_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_L2x4_SUB2_16
+    KERNEL2x4_4 0,0, 0,0
+    KERNEL2x4_4 0,0, 1,0
+    KERNEL2x4_4 0,0, 2,0
+    KERNEL2x4_4 0,0, 3,0
+    KERNEL2x4_4 0,0, 4,0
+    KERNEL2x4_4 0,0, 5,0
+    KERNEL2x4_4 0,0, 6,0
+    KERNEL2x4_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_L2x4_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_L2x4_SUB2_8
+    KERNEL2x4_4 0,0, 0,0
+    KERNEL2x4_4 0,0, 1,0
+    KERNEL2x4_4 0,0, 2,0
+    KERNEL2x4_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_L2x4_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_L2x4_SUB2_4
+    KERNEL2x4_4 0,0, 0,0
+    KERNEL2x4_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_L2x4_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_L2x4_SUB2_2
+    KERNEL2x4_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x4_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_L2x4_SUB2_1
+    KERNEL2x4_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x4_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_L2x4_SAVE
+    KERNEL2x4
+
+    MY_ALIGN
+LSGEMM_L2x4_SAVE:
+    SAVE2x4
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2
+#endif
+    MY_ALIGN
+LSGEMM_L2x4_END:
+    andi. I, M, 2
+    ble LSGEMM_L2x2_END
+
+    MY_ALIGN
+LSGEMM_L2x2_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,2,2
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO2x2
+    ble LSGEMM_L2x2_SUB0
+
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L2x2_LOOP:
+
+    KERNEL2x2_4 0,0, 0,0
+    KERNEL2x2_4 0,0, 1,0
+    KERNEL2x2_4 0,0, 2,0
+    KERNEL2x2_4 0,0, 3,0
+    KERNEL2x2_4 0,0, 4,0
+    KERNEL2x2_4 0,0, 5,0
+    KERNEL2x2_4 0,0, 6,0
+    KERNEL2x2_4 0,0, 7,0
+    KERNEL2x2_4 0,0, 8,0
+    KERNEL2x2_4 0,0, 9,0
+    KERNEL2x2_4 0,0, 10,0
+    KERNEL2x2_4 0,0, 11,0
+    KERNEL2x2_4 0,0, 12,0
+    KERNEL2x2_4 0,0, 13,0
+    KERNEL2x2_4 0,0, 14,0
+    KERNEL2x2_4 0,0, 15,1
+
+    bdnz LSGEMM_L2x2_LOOP
+    MY_ALIGN
+
+    MY_ALIGN
+LSGEMM_L2x2_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_L2x2_SAVE
+    MY_ALIGN
+LSGEMM_L2x2_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_L2x2_SUB2_16
+    KERNEL2x2_4 0,0, 0,0
+    KERNEL2x2_4 0,0, 1,0
+    KERNEL2x2_4 0,0, 2,0
+    KERNEL2x2_4 0,0, 3,0
+    KERNEL2x2_4 0,0, 4,0
+    KERNEL2x2_4 0,0, 5,0
+    KERNEL2x2_4 0,0, 6,0
+    KERNEL2x2_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_L2x2_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_L2x2_SUB2_8
+    KERNEL2x2_4 0,0, 0,0
+    KERNEL2x2_4 0,0, 1,0
+    KERNEL2x2_4 0,0, 2,0
+    KERNEL2x2_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_L2x2_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_L2x2_SUB2_4
+    KERNEL2x2_4 0,0, 0,0
+    KERNEL2x2_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_L2x2_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_L2x2_SUB2_2
+    KERNEL2x2_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x2_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_L2x2_SUB2_1
+    KERNEL2x2_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x2_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_L2x2_SAVE
+    KERNEL2x2
+
+    MY_ALIGN
+LSGEMM_L2x2_SAVE:
+    SAVE2x2
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2
+#endif
+    MY_ALIGN
+LSGEMM_L2x2_END:
+    andi. I, M, 1
+    ble LSGEMM_L2x1_END
+
+    MY_ALIGN
+LSGEMM_L2x1_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,1,2
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO2x1
+    ble LSGEMM_L2x1_SUB0
+
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_L2x1_LOOP:
+
+    KERNEL2x1_4 0,0, 0,0
+    KERNEL2x1_4 0,0, 1,0
+    KERNEL2x1_4 0,0, 2,0
+    KERNEL2x1_4 0,0, 3,0
+    KERNEL2x1_4 0,0, 4,0
+    KERNEL2x1_4 0,0, 5,0
+    KERNEL2x1_4 0,0, 6,0
+    KERNEL2x1_4 0,0, 7,0
+    KERNEL2x1_4 0,0, 8,0
+    KERNEL2x1_4 0,0, 9,0
+    KERNEL2x1_4 0,0, 10,0
+    KERNEL2x1_4 0,0, 11,0
+    KERNEL2x1_4 0,0, 12,0
+    KERNEL2x1_4 0,0, 13,0
+    KERNEL2x1_4 0,0, 14,0
+    KERNEL2x1_4 0,0, 15,1
+
+    bdnz LSGEMM_L2x1_LOOP
+    MY_ALIGN
+
+    MY_ALIGN
+LSGEMM_L2x1_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_L2x1_SAVE
+    MY_ALIGN
+LSGEMM_L2x1_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_L2x1_SUB2_16
+    KERNEL2x1_4 0,0, 0,0
+    KERNEL2x1_4 0,0, 1,0
+    KERNEL2x1_4 0,0, 2,0
+    KERNEL2x1_4 0,0, 3,0
+    KERNEL2x1_4 0,0, 4,0
+    KERNEL2x1_4 0,0, 5,0
+    KERNEL2x1_4 0,0, 6,0
+    KERNEL2x1_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_L2x1_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_L2x1_SUB2_8
+    KERNEL2x1_4 0,0, 0,0
+    KERNEL2x1_4 0,0, 1,0
+    KERNEL2x1_4 0,0, 2,0
+    KERNEL2x1_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_L2x1_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_L2x1_SUB2_4
+    KERNEL2x1_4 0,0, 0,0
+    KERNEL2x1_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_L2x1_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_L2x1_SUB2_2
+    KERNEL2x1_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x1_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_L2x1_SUB2_1
+    KERNEL2x1_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_L2x1_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_L2x1_SAVE
+    KERNEL2x1
+
+    MY_ALIGN
+LSGEMM_L2x1_SAVE:
+    SAVE2x1
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2
+#endif
+    MY_ALIGN
+LSGEMM_L2x1_END:
+    slwi T1, K, 3
+    add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi TEMP_REG, TEMP_REG, 2
+#endif
+LSGEMM_L2_END:
+    andi. T1, N, 1
+    ble LSGEMM_END
+LSGEMM_1_BEGIN:
+
+
+    mr AO, A
+    mr CO, C
+    add C, C, LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+    srawi. I, M, 4
+    ble LSGEMM_1x16_END
+
+    MY_ALIGN
+LSGEMM_1x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,16,1
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO1x16
+    ble LSGEMM_1x16_SUB0
+    addi AO,AO,2048
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_1x16_LOOP:
+
+    KERNEL1x16_4 -2048,0, 0,0
+    KERNEL1x16_4 -2048,0, 1,0
+    KERNEL1x16_4 -2048,0, 2,0
+    KERNEL1x16_4 -2048,0, 3,0
+    KERNEL1x16_4 -2048,0, 4,0
+    KERNEL1x16_4 -2048,0, 5,0
+    KERNEL1x16_4 -2048,0, 6,0
+    KERNEL1x16_4 -2048,0, 7,0
+    KERNEL1x16_4 -2048,0, 8,0
+    KERNEL1x16_4 -2048,0, 9,0
+    KERNEL1x16_4 -2048,0, 10,0
+    KERNEL1x16_4 -2048,0, 11,0
+    KERNEL1x16_4 -2048,0, 12,0
+    KERNEL1x16_4 -2048,0, 13,0
+    KERNEL1x16_4 -2048,0, 14,0
+    KERNEL1x16_4 -2048,0, 15,1
+
+    bdnz LSGEMM_1x16_LOOP
+    MY_ALIGN
+    addi AO,AO, -2048
+    MY_ALIGN
+LSGEMM_1x16_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_1x16_SAVE
+    MY_ALIGN
+LSGEMM_1x16_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_1x16_SUB2_16
+    KERNEL1x16_4 0,0, 0,0
+    KERNEL1x16_4 0,0, 1,0
+    KERNEL1x16_4 0,0, 2,0
+    KERNEL1x16_4 0,0, 3,0
+    KERNEL1x16_4 0,0, 4,0
+    KERNEL1x16_4 0,0, 5,0
+    KERNEL1x16_4 0,0, 6,0
+    KERNEL1x16_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_1x16_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_1x16_SUB2_8
+    KERNEL1x16_4 0,0, 0,0
+    KERNEL1x16_4 0,0, 1,0
+    KERNEL1x16_4 0,0, 2,0
+    KERNEL1x16_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_1x16_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_1x16_SUB2_4
+    KERNEL1x16_4 0,0, 0,0
+    KERNEL1x16_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_1x16_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_1x16_SUB2_2
+    KERNEL1x16_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x16_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_1x16_SUB2_1
+    KERNEL1x16_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x16_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_1x16_SAVE
+    KERNEL1x16
+
+    MY_ALIGN
+LSGEMM_1x16_SAVE:
+    SAVE1x16
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1
+#endif
+    addic. I, I, -1
+    bgt+ LSGEMM_1x16_BEGIN
+    MY_ALIGN
+LSGEMM_1x16_END:
+    andi. I, M, 8
+    ble LSGEMM_1x8_END
+
+    MY_ALIGN
+LSGEMM_1x8_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,8,1
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO1x8
+    ble LSGEMM_1x8_SUB0
+    addi AO,AO,2048
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_1x8_LOOP:
+
+    KERNEL1x8_4 -2048,0, 0,0
+    KERNEL1x8_4 -2048,0, 1,0
+    KERNEL1x8_4 -2048,0, 2,0
+    KERNEL1x8_4 -2048,0, 3,0
+    KERNEL1x8_4 -2048,0, 4,0
+    KERNEL1x8_4 -2048,0, 5,0
+    KERNEL1x8_4 -2048,0, 6,0
+    KERNEL1x8_4 -2048,0, 7,0
+    KERNEL1x8_4 -2048,0, 8,0
+    KERNEL1x8_4 -2048,0, 9,0
+    KERNEL1x8_4 -2048,0, 10,0
+    KERNEL1x8_4 -2048,0, 11,0
+    KERNEL1x8_4 -2048,0, 12,0
+    KERNEL1x8_4 -2048,0, 13,0
+    KERNEL1x8_4 -2048,0, 14,0
+    KERNEL1x8_4 -2048,0, 15,1
+
+    bdnz LSGEMM_1x8_LOOP
+    MY_ALIGN
+    addi AO,AO, -2048
+    MY_ALIGN
+LSGEMM_1x8_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_1x8_SAVE
+    MY_ALIGN
+LSGEMM_1x8_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_1x8_SUB2_16
+    KERNEL1x8_4 0,0, 0,0
+    KERNEL1x8_4 0,0, 1,0
+    KERNEL1x8_4 0,0, 2,0
+    KERNEL1x8_4 0,0, 3,0
+    KERNEL1x8_4 0,0, 4,0
+    KERNEL1x8_4 0,0, 5,0
+    KERNEL1x8_4 0,0, 6,0
+    KERNEL1x8_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_1x8_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_1x8_SUB2_8
+    KERNEL1x8_4 0,0, 0,0
+    KERNEL1x8_4 0,0, 1,0
+    KERNEL1x8_4 0,0, 2,0
+    KERNEL1x8_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_1x8_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_1x8_SUB2_4
+    KERNEL1x8_4 0,0, 0,0
+    KERNEL1x8_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_1x8_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_1x8_SUB2_2
+    KERNEL1x8_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x8_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_1x8_SUB2_1
+    KERNEL1x8_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x8_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_1x8_SAVE
+    KERNEL1x8
+
+    MY_ALIGN
+LSGEMM_1x8_SAVE:
+    SAVE1x8
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1
+#endif
+    MY_ALIGN
+LSGEMM_1x8_END:
+    andi. I, M, 4
+    ble LSGEMM_1x4_END
+
+    MY_ALIGN
+LSGEMM_1x4_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,4,1
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO1x4
+    ble LSGEMM_1x4_SUB0
+
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_1x4_LOOP:
+
+    KERNEL1x4_4 0,0, 0,0
+    KERNEL1x4_4 0,0, 1,0
+    KERNEL1x4_4 0,0, 2,0
+    KERNEL1x4_4 0,0, 3,0
+    KERNEL1x4_4 0,0, 4,0
+    KERNEL1x4_4 0,0, 5,0
+    KERNEL1x4_4 0,0, 6,0
+    KERNEL1x4_4 0,0, 7,0
+    KERNEL1x4_4 0,0, 8,0
+    KERNEL1x4_4 0,0, 9,0
+    KERNEL1x4_4 0,0, 10,0
+    KERNEL1x4_4 0,0, 11,0
+    KERNEL1x4_4 0,0, 12,0
+    KERNEL1x4_4 0,0, 13,0
+    KERNEL1x4_4 0,0, 14,0
+    KERNEL1x4_4 0,0, 15,1
+
+    bdnz LSGEMM_1x4_LOOP
+    MY_ALIGN
+
+    MY_ALIGN
+LSGEMM_1x4_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_1x4_SAVE
+    MY_ALIGN
+LSGEMM_1x4_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_1x4_SUB2_16
+    KERNEL1x4_4 0,0, 0,0
+    KERNEL1x4_4 0,0, 1,0
+    KERNEL1x4_4 0,0, 2,0
+    KERNEL1x4_4 0,0, 3,0
+    KERNEL1x4_4 0,0, 4,0
+    KERNEL1x4_4 0,0, 5,0
+    KERNEL1x4_4 0,0, 6,0
+    KERNEL1x4_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_1x4_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_1x4_SUB2_8
+    KERNEL1x4_4 0,0, 0,0
+    KERNEL1x4_4 0,0, 1,0
+    KERNEL1x4_4 0,0, 2,0
+    KERNEL1x4_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_1x4_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_1x4_SUB2_4
+    KERNEL1x4_4 0,0, 0,0
+    KERNEL1x4_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_1x4_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_1x4_SUB2_2
+    KERNEL1x4_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x4_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_1x4_SUB2_1
+    KERNEL1x4_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x4_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_1x4_SAVE
+    KERNEL1x4
+
+    MY_ALIGN
+LSGEMM_1x4_SAVE:
+    SAVE1x4
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1
+#endif
+    MY_ALIGN
+LSGEMM_1x4_END:
+    andi. I, M, 2
+    ble LSGEMM_1x2_END
+
+    MY_ALIGN
+LSGEMM_1x2_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,2,1
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO1x2
+    ble LSGEMM_1x2_SUB0
+
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_1x2_LOOP:
+
+    KERNEL1x2_4 0,0, 0,0
+    KERNEL1x2_4 0,0, 1,0
+    KERNEL1x2_4 0,0, 2,0
+    KERNEL1x2_4 0,0, 3,0
+    KERNEL1x2_4 0,0, 4,0
+    KERNEL1x2_4 0,0, 5,0
+    KERNEL1x2_4 0,0, 6,0
+    KERNEL1x2_4 0,0, 7,0
+    KERNEL1x2_4 0,0, 8,0
+    KERNEL1x2_4 0,0, 9,0
+    KERNEL1x2_4 0,0, 10,0
+    KERNEL1x2_4 0,0, 11,0
+    KERNEL1x2_4 0,0, 12,0
+    KERNEL1x2_4 0,0, 13,0
+    KERNEL1x2_4 0,0, 14,0
+    KERNEL1x2_4 0,0, 15,1
+
+    bdnz LSGEMM_1x2_LOOP
+    MY_ALIGN
+
+    MY_ALIGN
+LSGEMM_1x2_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_1x2_SAVE
+    MY_ALIGN
+LSGEMM_1x2_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_1x2_SUB2_16
+    KERNEL1x2_4 0,0, 0,0
+    KERNEL1x2_4 0,0, 1,0
+    KERNEL1x2_4 0,0, 2,0
+    KERNEL1x2_4 0,0, 3,0
+    KERNEL1x2_4 0,0, 4,0
+    KERNEL1x2_4 0,0, 5,0
+    KERNEL1x2_4 0,0, 6,0
+    KERNEL1x2_4 0,0, 7,1
+    MY_ALIGN
+LSGEMM_1x2_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_1x2_SUB2_8
+    KERNEL1x2_4 0,0, 0,0
+    KERNEL1x2_4 0,0, 1,0
+    KERNEL1x2_4 0,0, 2,0
+    KERNEL1x2_4 0,0, 3,1
+    MY_ALIGN
+LSGEMM_1x2_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_1x2_SUB2_4
+    KERNEL1x2_4 0,0, 0,0
+    KERNEL1x2_4 0,0, 1,1
+    MY_ALIGN
+LSGEMM_1x2_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_1x2_SUB2_2
+    KERNEL1x2_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x2_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_1x2_SUB2_1
+    KERNEL1x2_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x2_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_1x2_SAVE
+    KERNEL1x2
+
+    MY_ALIGN
+LSGEMM_1x2_SAVE:
+    SAVE1x2
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1
+#endif
+    MY_ALIGN
+LSGEMM_1x2_END:
+    andi. I, M, 1
+    ble LSGEMM_1x1_END
+
+    MY_ALIGN
+LSGEMM_1x1_BEGIN:
+
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+    mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T11,K,TEMP_REG,1,1
+    srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+    srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+    ZERO1x1
+    ble LSGEMM_1x1_SUB0
+
+
+    mtctr L
+
+    MY_ALIGN
+
+LSGEMM_1x1_LOOP:
+
+    KERNEL1x1_16 0,0, 0,0
+    KERNEL1x1_16 0,0, 1,0
+    KERNEL1x1_16 0,0, 2,0
+    KERNEL1x1_16 0,0, 3,1
+
+    bdnz LSGEMM_1x1_LOOP
+    MY_ALIGN
+
+    MY_ALIGN
+LSGEMM_1x1_SUB0:
+#if defined(TRMMKERNEL)
+    andi. L, T11, 63
+#else
+    andi. L, K, 63
+#endif
+    ble LSGEMM_1x1_SAVE
+    MY_ALIGN
+LSGEMM_1x1_SUB2:
+    andi. T10,L, 32
+    ble LSGEMM_1x1_SUB2_16
+    KERNEL1x1_16 0,0, 0,0
+    KERNEL1x1_16 0,0, 1,1
+    MY_ALIGN
+LSGEMM_1x1_SUB2_16:
+    andi. T10,L, 16
+    ble LSGEMM_1x1_SUB2_8
+    KERNEL1x1_16 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x1_SUB2_8:
+    andi. T10,L, 8
+    ble LSGEMM_1x1_SUB2_4
+    KERNEL1x1_8 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x1_SUB2_4:
+    andi. T10,L, 4
+    ble LSGEMM_1x1_SUB2_2
+    KERNEL1x1_4 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x1_SUB2_2:
+    andi. T10,L, 2
+    ble LSGEMM_1x1_SUB2_1
+    KERNEL1x1_2 0,0, 0,1
+    MY_ALIGN
+LSGEMM_1x1_SUB2_1:
+    andi. T10,L, 1
+    ble LSGEMM_1x1_SAVE
+    KERNEL1x1
+
+    MY_ALIGN
+LSGEMM_1x1_SAVE:
+    SAVE1x1
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1
+#endif
+    MY_ALIGN
+LSGEMM_1x1_END:
+    slwi T1, K, 2
+    add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi TEMP_REG, TEMP_REG, 1
+#endif
+LSGEMM_END:
\ No newline at end of file
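The macro file that follows defines DISP1..DISP64 helpers, (ind*unit_size*N + disp) in bytes, so an unrolled block can address successive A/B tiles as constant displacements from one base register and advance AO/BO only once per block (the addi at the end of each kernel macro). A toy C analog of that addressing style, with hypothetical names:

#include <stdio.h>

#define UNIT_SIZE 4                                 /* sizeof(float), as in the .S file */
#define DISP16(ind, disp) ((ind) * UNIT_SIZE * 16 + (disp))

/* sums 4 unrolled steps of 16 floats per block; the base pointer is
   advanced once per block instead of once per step */
static float sum_blocks(const float *a, int blocks) {
    float acc = 0.0f;
    for (int b = 0; b < blocks; ++b) {
        for (int ind = 0; ind < 4; ++ind)           /* 4 unrolled steps */
            for (int j = 0; j < 16; ++j)
                acc += *(const float *)((const char *)a + DISP16(ind, j * UNIT_SIZE));
        a += 4 * 16;                                /* single pointer update */
    }
    return acc;
}

int main(void) {
    float x[128];
    for (int i = 0; i < 128; ++i) x[i] = 1.0f;
    printf("%f\n", sum_blocks(x, 2));               /* 128.0 */
    return 0;
}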
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + LOAD8x16 1 +.endm + +.macro LOAD8x16_0 + LOAD8x16 0 +.endm + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + 
xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + + lxv vs24, 
+
+.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+    lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)
+    lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)
+
+    lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)
+    lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)
+    lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)
+    lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG)
+
+    xxperm vs10, vs8, permute_mask
+    xxperm vs14, vs12, permute_mask
+    xxpermdi vs9, vs8, vs8,2
+    xxpermdi vs13, vs12, vs12,2
+
+    xvmaddasp vs32, vs0,vs24
+    xvmaddasp vs33, vs1,vs24
+    xvmaddasp vs34, vs2,vs24
+    xvmaddasp vs35, vs3,vs24
+
+    xvmaddasp vs36, vs0,vs25
+    xvmaddasp vs37, vs1,vs25
+    xvmaddasp vs38, vs2,vs25
+    xvmaddasp vs39, vs3,vs25
+
+    xxpermdi vs11, vs10, vs10,2
+    xxpermdi vs15, vs14, vs14,2
+
+    xvmaddasp vs40, vs0,vs26
+    xvmaddasp vs41, vs1,vs26
+    xvmaddasp vs42, vs2,vs26
+    xvmaddasp vs43, vs3,vs26
+
+    xvmaddasp vs44, vs0,vs27
+    xvmaddasp vs45, vs1,vs27
+    xvmaddasp vs46, vs2,vs27
+    xvmaddasp vs47, vs3,vs27
+
+    xvmaddasp vs48, vs0,vs28
+    xvmaddasp vs49, vs1,vs28
+    xvmaddasp vs50, vs2,vs28
+    xvmaddasp vs51, vs3,vs28
+
+    xvmaddasp vs52, vs0,vs29
+    xvmaddasp vs53, vs1,vs29
+    xvmaddasp vs54, vs2,vs29
+    xvmaddasp vs55, vs3,vs29
+
+    xvmaddasp vs56, vs0,vs30
+    xvmaddasp vs57, vs1,vs30
+    xvmaddasp vs58, vs2,vs30
+    xvmaddasp vs59, vs3,vs30
+
+    xvmaddasp vs60, vs0,vs31
+    xvmaddasp vs61, vs1,vs31
+    xvmaddasp vs62, vs2,vs31
+    xvmaddasp vs63, vs3,vs31
+
+    lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
+    lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
+
+    lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)
+    lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)
+    lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)
+    lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)
+
+    xxperm vs26, vs24, permute_mask
+    xxperm vs30, vs28, permute_mask
+    xxpermdi vs25, vs24, vs24,2
+    xxpermdi vs29, vs28, vs28,2
+
+
+    xvmaddasp vs32, vs4,vs8
+    xvmaddasp vs33, vs5,vs8
+    xvmaddasp vs34, vs6,vs8
+    xvmaddasp vs35, vs7,vs8
+
+    xvmaddasp vs36, vs4,vs9
+    xvmaddasp vs37, vs5,vs9
+    xvmaddasp vs38, vs6,vs9
+    xvmaddasp vs39, vs7,vs9
+
+    xxpermdi vs27, vs26, vs26,2
+    xxpermdi vs31, vs30, vs30,2
+
+    xvmaddasp vs40, vs4,vs10
+    xvmaddasp vs41, vs5,vs10
+    xvmaddasp vs42, vs6,vs10
+    xvmaddasp vs43, vs7,vs10
+
+    xvmaddasp vs44, vs4,vs11
+    xvmaddasp vs45, vs5,vs11
+    xvmaddasp vs46, vs6,vs11
+    xvmaddasp vs47, vs7,vs11
+
+    xvmaddasp vs48, vs4,vs12
+    xvmaddasp vs49, vs5,vs12
+    xvmaddasp vs50, vs6,vs12
+    xvmaddasp vs51, vs7,vs12
+
+    xvmaddasp vs52, vs4,vs13
+    xvmaddasp vs53, vs5,vs13
+    xvmaddasp vs54, vs6,vs13
+    xvmaddasp vs55, vs7,vs13
+
+    xvmaddasp vs56, vs4,vs14
+    xvmaddasp vs57, vs5,vs14
+    xvmaddasp vs58, vs6,vs14
+    xvmaddasp vs59, vs7,vs14
+
+    xvmaddasp vs60, vs4,vs15
+    xvmaddasp vs61, vs5,vs15
+    xvmaddasp vs62, vs6,vs15
+    xvmaddasp vs63, vs7,vs15
+
+    lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
+    lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
+
+    lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)
+    lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)
+    lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)
+    lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG)
+
+    xxperm vs10, vs8, permute_mask
+    xxperm vs14, vs12, permute_mask
+    xxpermdi vs9, vs8, vs8,2
+    xxpermdi vs13, vs12, vs12,2
+
+    xvmaddasp vs32, vs0,vs24
+    xvmaddasp vs33, vs1,vs24
+    xvmaddasp vs34, vs2,vs24
+    xvmaddasp vs35, vs3,vs24
+
+    xvmaddasp vs36, vs0,vs25
+    xvmaddasp vs37, vs1,vs25
+    xvmaddasp vs38, vs2,vs25
+    xvmaddasp vs39, vs3,vs25
+
+    xxpermdi vs11, vs10, vs10,2
+    xxpermdi vs15, vs14, vs14,2
+
+    xvmaddasp vs40, vs0,vs26
+    xvmaddasp vs41, vs1,vs26
+    xvmaddasp vs42, vs2,vs26
+    xvmaddasp vs43, vs3,vs26
+
+    xvmaddasp vs44, vs0,vs27
+    xvmaddasp vs45, vs1,vs27
+    xvmaddasp vs46, vs2,vs27
+    xvmaddasp vs47, vs3,vs27
+
+    xvmaddasp vs48, vs0,vs28
+    xvmaddasp vs49, vs1,vs28
+    xvmaddasp vs50, vs2,vs28
+    xvmaddasp vs51, vs3,vs28
+
+    xvmaddasp vs52, vs0,vs29
+    xvmaddasp vs53, vs1,vs29
+    xvmaddasp vs54, vs2,vs29
+    xvmaddasp vs55, vs3,vs29
+
+    xvmaddasp vs56, vs0,vs30
+    xvmaddasp vs57, vs1,vs30
+    xvmaddasp vs58, vs2,vs30
+    xvmaddasp vs59, vs3,vs30
+
+    xvmaddasp vs60, vs0,vs31
+    xvmaddasp vs61, vs1,vs31
+    xvmaddasp vs62, vs2,vs31
+    xvmaddasp vs63, vs3,vs31
+
+.if \Complete==0
+    lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
+    lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
+
+    lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)
+    lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG)
+    lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)
+    lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)
+
+    xxperm vs26, vs24, permute_mask
+    xxperm vs30, vs28, permute_mask
+    xxpermdi vs25, vs24, vs24,2
+    xxpermdi vs29, vs28, vs28,2
+
+.endif
+.if \IsLast==1
+.if \Complete==1
+
+    addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
+    addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
+.else
+
+    addi \BREG, \BREG, DISP32(\Index,128)
+    addi \AREG, \AREG, DISP64(\Index,256)
+.endif
+.endif
+
+    xvmaddasp vs32, vs4,vs8
+    xvmaddasp vs33, vs5,vs8
+    xvmaddasp vs34, vs6,vs8
+    xvmaddasp vs35, vs7,vs8
+
+    xvmaddasp vs36, vs4,vs9
+    xvmaddasp vs37, vs5,vs9
+    xvmaddasp vs38, vs6,vs9
+    xvmaddasp vs39, vs7,vs9
+
+.if \Complete==0
+    xxpermdi vs27, vs26, vs26,2
+    xxpermdi vs31, vs30, vs30,2
+
+.endif
+
+    xvmaddasp vs40, vs4,vs10
+    xvmaddasp vs41, vs5,vs10
+    xvmaddasp vs42, vs6,vs10
+    xvmaddasp vs43, vs7,vs10
+
+    xvmaddasp vs44, vs4,vs11
+    xvmaddasp vs45, vs5,vs11
+    xvmaddasp vs46, vs6,vs11
+    xvmaddasp vs47, vs7,vs11
+
+    xvmaddasp vs48, vs4,vs12
+    xvmaddasp vs49, vs5,vs12
+    xvmaddasp vs50, vs6,vs12
+    xvmaddasp vs51, vs7,vs12
+
+    xvmaddasp vs52, vs4,vs13
+    xvmaddasp vs53, vs5,vs13
+    xvmaddasp vs54, vs6,vs13
+    xvmaddasp vs55, vs7,vs13
+
+    xvmaddasp vs56, vs4,vs14
+    xvmaddasp vs57, vs5,vs14
+    xvmaddasp vs58, vs6,vs14
+    xvmaddasp vs59, vs7,vs14
+
+    xvmaddasp vs60, vs4,vs15
+    xvmaddasp vs61, vs5,vs15
+    xvmaddasp vs62, vs6,vs15
+    xvmaddasp vs63, vs7,vs15
+
+.endm
+
+.macro KERNEL8x16 First
+
+    LOAD8x16 0
+    END8x16 \First, AO, BO, 64,32
+.endm
+
+.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+    lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+    lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
+
+    lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+    lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+    lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
+    lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
+
+    xxperm vs10, vs8, permute_mask
+    xxperm vs14, vs12, permute_mask
+    xxpermdi vs9, vs8, vs8,2
+    xxpermdi vs13, vs12, vs12,2
+.if \First==1
+    xvmulsp vs32, vs0,vs24
+    xvmulsp vs33, vs1,vs24
+    xvmulsp vs34, vs2,vs24
+    xvmulsp vs35, vs3,vs24
+
+    xvmulsp vs36, vs0,vs25
+    xvmulsp vs37, vs1,vs25
+    xvmulsp vs38, vs2,vs25
+    xvmulsp vs39, vs3,vs25
+.else
+    xvmaddasp vs32, vs0,vs24
+    xvmaddasp vs33, vs1,vs24
+    xvmaddasp vs34, vs2,vs24
+    xvmaddasp vs35, vs3,vs24
+
+    xvmaddasp vs36, vs0,vs25
+    xvmaddasp vs37, vs1,vs25
+    xvmaddasp vs38, vs2,vs25
+    xvmaddasp vs39, vs3,vs25
+.endif
+
+    xxpermdi vs11, vs10, vs10,2
+    xxpermdi vs15, vs14, vs14,2
+
+.if \First==1
+    xvmulsp vs40, vs0,vs26
+    xvmulsp vs41, vs1,vs26
+    xvmulsp vs42, vs2,vs26
+    xvmulsp vs43, vs3,vs26
+
+    xvmulsp vs44, vs0,vs27
+    xvmulsp vs45, vs1,vs27
+    xvmulsp vs46, vs2,vs27
+    xvmulsp vs47, vs3,vs27
+
+    xvmulsp vs48, vs0,vs28
+    xvmulsp vs49, vs1,vs28
+    xvmulsp vs50, vs2,vs28
+    xvmulsp vs51, vs3,vs28
+
+    xvmulsp vs52, vs0,vs29
+    xvmulsp vs53, vs1,vs29
+    xvmulsp vs54, vs2,vs29
+    xvmulsp vs55, vs3,vs29
+
+    xvmulsp vs56, vs0,vs30
+    xvmulsp vs57, vs1,vs30
+    xvmulsp vs58, vs2,vs30
+    xvmulsp vs59, vs3,vs30
+
+    xvmulsp vs60, vs0,vs31
+    xvmulsp vs61, vs1,vs31
+    xvmulsp vs62, vs2,vs31
+    xvmulsp vs63, vs3,vs31
+
+.else
+    xvmaddasp vs40, vs0,vs26
+    xvmaddasp vs41, vs1,vs26
+    xvmaddasp vs42, vs2,vs26
+    xvmaddasp vs43, vs3,vs26
+
+    xvmaddasp vs44, vs0,vs27
+    xvmaddasp vs45, vs1,vs27
+    xvmaddasp vs46, vs2,vs27
+    xvmaddasp vs47, vs3,vs27
+
+    xvmaddasp vs48, vs0,vs28
+    xvmaddasp vs49, vs1,vs28
+    xvmaddasp vs50, vs2,vs28
+    xvmaddasp vs51, vs3,vs28
+
+    xvmaddasp vs52, vs0,vs29
+    xvmaddasp vs53, vs1,vs29
+    xvmaddasp vs54, vs2,vs29
+    xvmaddasp vs55, vs3,vs29
+
+    xvmaddasp vs56, vs0,vs30
+    xvmaddasp vs57, vs1,vs30
+    xvmaddasp vs58, vs2,vs30
+    xvmaddasp vs59, vs3,vs30
+
+    xvmaddasp vs60, vs0,vs31
+    xvmaddasp vs61, vs1,vs31
+    xvmaddasp vs62, vs2,vs31
+    xvmaddasp vs63, vs3,vs31
+
+.endif
+.if \Complete==0
+    lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
+    lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
+
+    lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
+    lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
+    lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
+    lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
+
+    xxperm vs26, vs24, permute_mask
+    xxperm vs30, vs28, permute_mask
+    xxpermdi vs25, vs24, vs24,2
+    xxpermdi vs29, vs28, vs28,2
+.endif
+.if \IsLast==1
+.if \Complete==1
+    addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
+    addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
+
+.else
+    addi \BREG, \BREG, DISP16(\Index,64)
+    addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+.endif
+
+.if \First==1
+    xvmulsp vs32, vs4,vs8
+    xvmulsp vs33, vs5,vs8
+    xvmulsp vs34, vs6,vs8
+    xvmulsp vs35, vs7,vs8
+
+    xvmulsp vs36, vs4,vs9
+    xvmulsp vs37, vs5,vs9
+    xvmulsp vs38, vs6,vs9
+    xvmulsp vs39, vs7,vs9
+.else
+    xvmaddasp vs32, vs4,vs8
+    xvmaddasp vs33, vs5,vs8
+    xvmaddasp vs34, vs6,vs8
+    xvmaddasp vs35, vs7,vs8
+
+    xvmaddasp vs36, vs4,vs9
+    xvmaddasp vs37, vs5,vs9
+    xvmaddasp vs38, vs6,vs9
+    xvmaddasp vs39, vs7,vs9
+.endif
+
+.if \Complete==0
+    xxpermdi vs27, vs26, vs26,2
+    xxpermdi vs31, vs30, vs30,2
+
+.endif
+.if \First==1
+    xvmulsp vs40, vs4,vs10
+    xvmulsp vs41, vs5,vs10
+    xvmulsp vs42, vs6,vs10
+    xvmulsp vs43, vs7,vs10
+
+    xvmulsp vs44, vs4,vs11
+    xvmulsp vs45, vs5,vs11
+    xvmulsp vs46, vs6,vs11
+    xvmulsp vs47, vs7,vs11
+
+    xvmulsp vs48, vs4,vs12
+    xvmulsp vs49, vs5,vs12
+    xvmulsp vs50, vs6,vs12
+    xvmulsp vs51, vs7,vs12
+
+    xvmulsp vs52, vs4,vs13
+    xvmulsp vs53, vs5,vs13
+    xvmulsp vs54, vs6,vs13
+    xvmulsp vs55, vs7,vs13
+
+    xvmulsp vs56, vs4,vs14
+    xvmulsp vs57, vs5,vs14
+    xvmulsp vs58, vs6,vs14
+    xvmulsp vs59, vs7,vs14
+
+    xvmulsp vs60, vs4,vs15
+    xvmulsp vs61, vs5,vs15
+    xvmulsp vs62, vs6,vs15
+    xvmulsp vs63, vs7,vs15
+
+.else
+    xvmaddasp vs40, vs4,vs10
+    xvmaddasp vs41, vs5,vs10
+    xvmaddasp vs42, vs6,vs10
+    xvmaddasp vs43, vs7,vs10
+
+    xvmaddasp vs44, vs4,vs11
+    xvmaddasp vs45, vs5,vs11
+    xvmaddasp vs46, vs6,vs11
+    xvmaddasp vs47, vs7,vs11
+
+    xvmaddasp vs48, vs4,vs12
+    xvmaddasp vs49, vs5,vs12
+    xvmaddasp vs50, vs6,vs12
+    xvmaddasp vs51, vs7,vs12
+
+    xvmaddasp vs52, vs4,vs13
+    xvmaddasp vs53, vs5,vs13
+    xvmaddasp vs54, vs6,vs13
+    xvmaddasp vs55, vs7,vs13
+
+    xvmaddasp vs56, vs4,vs14
+    xvmaddasp vs57, vs5,vs14
+    xvmaddasp vs58, vs6,vs14
+    xvmaddasp vs59, vs7,vs14
+
+    xvmaddasp vs60, vs4,vs15
+    xvmaddasp vs61, vs5,vs15
+    xvmaddasp vs62, vs6,vs15
+    xvmaddasp vs63, vs7,vs15
+
+.endif
+
+.endm
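The SAVE8x16 macro below has to undo the "butterfly" layout the kernels produce: each B vector is consumed in four permuted copies (the original, the xxperm word-pair swap, the xxpermdi doubleword swap, and both combined), so accumulator k of a 4-lane group holds a[j]*b[j XOR k] in lane j rather than a plain row. A self-contained C model of accumulating in that order and restoring normal rows follows; it is a sketch of the idea only, not the exact vs-register mapping:

#include <assert.h>
#include <stdio.h>

int main(void) {
    float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
    float acc[4][4] = {{0}}, c[4][4];

    /* kernel side: acc[k][j] += a[j] * b[j ^ k]; the permuted b copies
       correspond to lane maps j^1 (word pairs), j^2 (doublewords), j^3 */
    for (int k = 0; k < 4; ++k)
        for (int j = 0; j < 4; ++j)
            acc[k][j] += a[j] * b[j ^ k];

    /* save side: row r of the normal result sits in lane j of
       accumulator r ^ j */
    for (int r = 0; r < 4; ++r)
        for (int j = 0; j < 4; ++j)
            c[r][j] = acc[r ^ j][j];

    for (int r = 0; r < 4; ++r)
        for (int j = 0; j < 4; ++j)
            assert(c[r][j] == a[j] * b[r]);     /* outer product restored */
    printf("butterfly restore OK\n");
    return 0;
}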
permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs4,vs8
+ xvmulsp vs33, vs5,vs8
+ xvmulsp vs34, vs6,vs8
+ xvmulsp vs35, vs7,vs8
+
+ xvmulsp vs36, vs4,vs9
+ xvmulsp vs37, vs5,vs9
+ xvmulsp vs38, vs6,vs9
+ xvmulsp vs39, vs7,vs9
+.else
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.endif
+.if \First==1
+ xvmulsp vs40, vs4,vs10
+ xvmulsp vs41, vs5,vs10
+ xvmulsp vs42, vs6,vs10
+ xvmulsp vs43, vs7,vs10
+
+ xvmulsp vs44, vs4,vs11
+ xvmulsp vs45, vs5,vs11
+ xvmulsp vs46, vs6,vs11
+ xvmulsp vs47, vs7,vs11
+
+ xvmulsp vs48, vs4,vs12
+ xvmulsp vs49, vs5,vs12
+ xvmulsp vs50, vs6,vs12
+ xvmulsp vs51, vs7,vs12
+
+ xvmulsp vs52, vs4,vs13
+ xvmulsp vs53, vs5,vs13
+ xvmulsp vs54, vs6,vs13
+ xvmulsp vs55, vs7,vs13
+
+ xvmulsp vs56, vs4,vs14
+ xvmulsp vs57, vs5,vs14
+ xvmulsp vs58, vs6,vs14
+ xvmulsp vs59, vs7,vs14
+
+ xvmulsp vs60, vs4,vs15
+ xvmulsp vs61, vs5,vs15
+ xvmulsp vs62, vs6,vs15
+ xvmulsp vs63, vs7,vs15
+
+.else
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+ xvmaddasp vs50, vs6,vs12
+ xvmaddasp vs51, vs7,vs12
+
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+ xvmaddasp vs54, vs6,vs13
+ xvmaddasp vs55, vs7,vs13
+
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+ xvmaddasp vs58, vs6,vs14
+ xvmaddasp vs59, vs7,vs14
+
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+ xvmaddasp vs62, vs6,vs15
+ xvmaddasp vs63, vs7,vs15
+
+.endif
+
+.endm
+
+
+.macro SAVE8x16
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+ add T4, T2, T10
+ add T5, T3, T10
+
+ add T6, T4, T10
+ add T7, T5, T10
+
+
+
+ /* permute to restore the butterfly rank-1 update to the normal promoted one */
+ /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */
+ /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */
+ /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */
+ /* permute 16 vs24 MEM(48+CO) vs25 MEM(48+CO+LDC) vs26 MEM(48+CO+2*LDC) vs27 MEM(48+CO+3*LDC) */
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+
+ xxmrglw vs16, vs34, vs46
+ xxmrglw vs18, vs38, vs42
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxmrghw vs4, vs38, vs42
+ xxmrghw vs5, vs34, vs46
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxmrglw vs24, vs35, vs47
+ xxmrglw vs26, vs39, vs43
+
+ xxlor vs17, vs16, vs16
+ xxlor vs19, vs18, vs18
+
+ xxmrghw vs30, vs39, vs43
+ xxmrghw vs31, vs35, vs47
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+#ifndef TRMMKERNEL
+ lxv vs32, 0(CO)
+ lxv vs33, 16(CO)
+ lxv vs34, 32(CO)
+ lxv vs35, 48(CO)
+#endif
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T1)
+ lxv vs37, 16(T1)
+ lxv vs38, 32(T1)
+ lxv vs39, 48(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs40, 0(T2)
+ lxv vs41, 16(T2)
+ lxv vs42, 32(T2)
+ lxv vs43, 48(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T3)
+ lxv vs45, 16(T3)
+ lxv vs46, 32(T3)
+ lxv vs47, 48(T3)
+#endif
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+
+ xxperm vs17, vs4, save_permute_2
+ xxperm vs19, vs5, save_permute_2
+
+ xxperm vs24, vs30, save_permute_1
+ xxperm vs26, vs31, save_permute_1
+
+ xxperm vs25, vs30, save_permute_2
+ xxperm vs27, vs31, save_permute_2
+
+
+ /* multiply by alpha and accumulate into C (plain multiply for TRMM, which does not read C) */
+
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+ xvmulsp vs36, vs9, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+ xvmulsp vs38, vs17, alpha_r
+ xvmulsp vs39, vs25, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+ xvmaddasp vs38, vs17, alpha_r
+ xvmaddasp vs39, vs25, alpha_r
+#endif
+
+
+
+#ifdef TRMMKERNEL
+ xvmulsp vs40, vs10, alpha_r
+ xvmulsp vs41, vs14, alpha_r
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+
+ xvmaddasp vs40, vs10, alpha_r
+ xvmaddasp vs41, vs14, alpha_r
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+
+#endif
+
+ stxv vs32, 0(CO)
+ stxv vs33, 16(CO)
+ stxv vs34, 32(CO)
+ stxv vs35, 48(CO)
+
+ stxv vs36, 0(T1)
+ stxv vs37, 16(T1)
+ stxv vs38, 32(T1)
+ stxv vs39, 48(T1)
+
+ stxv vs40, 0(T2)
+ stxv vs41, 16(T2)
+ stxv vs42, 32(T2)
+ stxv vs43, 48(T2)
+ stxv vs44, 0(T3)
+ stxv vs45, 16(T3)
+ stxv vs46, 32(T3)
+ stxv vs47, 48(T3)
+
+ /***** the same for the second 8x8 block *****/
+#ifndef TRMMKERNEL
+
+ lxv vs32, 0(T4)
+ lxv vs33, 16(T4)
+ lxv vs34, 32(T4)
+ lxv vs35, 48(T4)
+ lxv vs36, 0(T5)
+ lxv vs37, 16(T5)
+ lxv vs38, 32(T5)
+ lxv vs39, 48(T5)
+#endif
+
+ xxmrglw vs8, vs48, vs60
+ xxmrglw vs10, vs52, vs56
+
+ xxmrghw vs1, vs48, vs60
+ xxmrghw vs0, vs52, vs56
+ xxmrglw vs12, vs49, vs61
+ xxmrglw vs14, vs53, vs57
+
+#ifndef TRMMKERNEL
+ lxv vs40, 0(T6)
+ lxv vs41, 16(T6)
+ lxv vs42, 32(T6)
+ lxv vs43, 48(T6)
+ lxv vs44, 0(T7)
+ lxv vs45, 16(T7)
+ lxv vs46, 32(T7)
+ lxv vs47, 48(T7)
+#endif
+ xxmrghw vs2, vs53, vs57
+ xxmrghw vs3, vs49, vs61
+
+ xxmrglw vs16, vs50, vs62
+ xxmrglw vs18, vs54, vs58
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+ xxmrghw vs4, vs54, vs58
+ xxmrghw vs5, vs50, vs62
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxmrglw vs24, vs51, vs63
+ xxmrglw vs26, vs55, vs59
+
+ xxlor vs17, vs16, vs16
+ xxlor vs19, vs18, vs18
+ xxmrghw vs30, vs55, vs59
+ xxmrghw vs31, vs51, vs63
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+ xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + stxv vs32, 0(T4) + stxv vs33, 16(T4) + stxv vs34, 32(T4) + stxv vs35, 48(T4) + + stxv vs36, 0(T5) + stxv vs37, 16(T5) + stxv vs38, 32(T5) + stxv vs39, 48(T5) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs40, 0(T6) + stxv vs41, 16(T6) + stxv vs42, 32(T6) + stxv vs43, 48(T6) + stxv vs44, 0(T7) + stxv vs45, 16(T7) + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 
0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, 
DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, 
DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, 
vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + 
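+/* KERNEL8x4_L1_L4_I below unrolls four k iterations and double-buffers the
+   panels: the FMAs on vs0-vs3 against vs24/vs25 overlap the loads of
+   vs4/vs26/vs27 for the next iteration, and vice versa; A advances
+   16 bytes (M=4 floats) and B 32 bytes (N=8 floats) per iteration */
+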
+.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp 
vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + 
lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + 
xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** 
+* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, 
DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv 
vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro 
Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 
First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + 
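+/* note on the SAVE macros: the xxmrglw/xxmrghw merges together with the
+   save_permute_1/save_permute_2 masks transpose the butterfly-ordered
+   accumulators back to C's row layout, after which alpha is applied with
+   xvmaddasp (C += alpha*A*B), or with xvmulsp for TRMM, where C is not
+   loaded (C = alpha*A*B) */
+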
+/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, 
\AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp 
vs0, vs26, vs8
+ xvmulsp vs2, vs26, vs9
+
+ xvmaddasp vs0, vs28, vs10
+ xvmaddasp vs2, vs28, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs2, vs26, vs9
+
+ xvmaddasp vs0, vs28, vs10
+ xvmaddasp vs2, vs28, vs11
+ .endif
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro SAVE4x2
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ /*convert alpha_r for multiply*/
+ xscvspdp vs4,alpha_r
+/* v0 corresponds to vs32, do not forget*/
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v1,4(CO)
+
+ lxssp v2,0(T1)
+ lxssp v3,4(T1)
+
+ lxssp v4,0(T2)
+ lxssp v5,4(T2)
+
+ lxssp v6,0(T3)
+ lxssp v7,4(T3)
+
+
+#endif
+ xscvspdp vs5, vs2
+ xxspltw vs6, vs2, 1
+ xxspltw vs7, vs2, 2
+ xxspltw vs8, vs2, 3
+ xscvspdp vs6,vs6
+ xscvspdp vs7,vs7
+ xscvspdp vs8,vs8
+
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+
+
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs8, vs4
+ xsmuldp vs33,vs27, vs4
+
+ xsmuldp vs34,vs7, vs4
+ xsmuldp vs35,vs26, vs4
+
+ xsmuldp vs36,vs6, vs4
+ xsmuldp vs37,vs25, vs4
+
+ xsmuldp vs38,vs5, vs4
+ xsmuldp vs39,vs24, vs4
+
+
+#else
+ xsmaddadp vs32,vs8, vs4
+ xsmaddadp vs33,vs27, vs4
+
+ xsmaddadp vs34,vs7, vs4
+ xsmaddadp vs35,vs26, vs4
+
+ xsmaddadp vs36,vs6, vs4
+ xsmaddadp vs37,vs25, vs4
+
+ xsmaddadp vs38,vs5, vs4
+ xsmaddadp vs39,vs24, vs4
+
+
+#endif
+
+ stxssp v0,0(CO)
+ stxssp v1,4(CO)
+
+ stxssp v2,0(T1)
+ stxssp v3,4(T1)
+
+ stxssp v4,0(T2)
+ stxssp v5,4(T2)
+
+ stxssp v6,0(T3)
+ stxssp v7,4(T3)
+
+
+
+
+ addi CO,CO,8
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro Zero4x1
+ xxlxor vs0, vs0, vs0
+.endm
+
+.macro KERNEL4x1
+ KERNEL4x1_1 AO,BO, 0
+.endm
+
+.macro KERNEL4x1_2
+ KERNEL4x1_2_1 AO,BO, 0
+.endm
+
+.macro KERNEL4x1_1 AREG,BREG,First
+ lxvwsx vs8, 0, \AREG
+ lxv vs26, 0(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+.else
+ xvmaddasp vs0, vs26, vs8
+ .endif
+ addi \AREG, \AREG, 4
+ addi \BREG, \BREG, 16
+.endm
+
+.macro KERNEL4x1_2_1 AREG,BREG,First
+ lxsd v4, 0(\AREG)
+ lxv vs26, 0(\BREG)
+ lxv vs28, 16(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmaddasp vs0, vs28, vs9
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs0, vs28, vs9
+ .endif
+ addi \AREG, \AREG, 8
+ addi \BREG, \BREG, 32
+.endm
+
+.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ xxspltw vs8, vs4, 3
+ xxspltw vs9, vs4, 2
+ xxspltw vs10, vs4, 1
+ xxspltw vs11, vs4, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG)
+ lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs0, vs32, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs0, vs32, vs11
+ .endif
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP16(\Index,64)
+.endif
+.endm
+
+.macro SAVE4x1
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ /*convert 
alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, 
vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, 
DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, 
DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs26, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP4(\Index,16)
+
+.endm
+
+
+
+
+.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG)
+
+ lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
+
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+ xxspltw vs12, vs39, 3
+ xxspltw vs13, vs39, 2
+ xxspltw vs14, vs39, 1
+ xxspltw vs15, vs39, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ xvmaddasp vs4, vs16, vs10
+ xvmaddasp vs5, vs16, vs11
+
+
+ xvmaddasp vs0, vs30, vs12
+ xvmaddasp vs1, vs30, vs13
+ xvmaddasp vs4, vs34, vs14
+ xvmaddasp vs5, vs34, vs15
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 3
+ xxspltw vs9, vs36, 2
+ xxspltw vs10, vs36, 1
+ xxspltw vs11, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG)
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ xvmaddasp vs4, vs16, vs10
+ xvmaddasp vs5, vs16, vs11
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro SAVE2x4
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T1)
+
+#endif
+ /*aggregate vectors*/
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs26, vs1, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs26, vs1, alpha_r
+#endif
+
+ stxv vs16, 0(CO)
+ stxv vs26, 0(T1)
+
+ addi CO,CO,16
+
+.endm
+
+
+/* M=2 N=2: here we use an inner permute. Before, permute_mask reversed the word order 3,2,1,0; now it will do an inner reverse, 1,0,3,2 */
+.macro SWITCH_PERMUTE_INNER
+ xxpermdi permute_mask, permute_mask, permute_mask,2
+.endm
+
+.macro Zero2x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ SWITCH_PERMUTE_INNER
+.endm
+
+.macro KERNEL2x2
+ KERNEL2x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxperm vs9, vs36, permute_mask
+ lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs37, vs36
+ xvmulsp vs1, vs37, vs9
+
+.else
+ xvmaddasp vs0, vs37, vs36
+ xvmaddasp vs1, vs37, vs9
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP2(\Index,8)
+
+.endm
+
+
+
+
+.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+ xxperm vs9, vs8, permute_mask
+ xxperm vs11, vs10, permute_mask
+
+
+
+ xvmaddasp 
vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 
2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 
16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + 
xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r +#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + 
xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi 
\AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG)
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
+
+ xvmaddasp vs0, vs8, vs26
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP4(\Index,16)
+.endif
+
+.endm
+
+.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG)
+ lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG)
+
+ xvmaddasp vs0, vs36, vs37
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP2(\Index,8)
+.endm
+
+
+.macro SAVE1x1
+
+#ifndef TRMMKERNEL
+ lxssp v4 , 0(CO)
+
+#endif
+
+ /*convert alpha_r for multiply*/
+ xscvspdp vs16,alpha_r
+
+ /*aggregate vectors */
+ xvaddsp vs0,vs0,vs1
+ xvaddsp vs2,vs2,vs3
+ xvaddsp vs0,vs0,vs2
+
+ xxpermdi vs7,vs0,vs0,2
+ xvaddsp vs0,vs0,vs7
+/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/
+ xscvspdp vs5, vs0
+ xxspltw vs6, vs0, 1
+ xscvspdp vs6,vs6
+ xsadddp vs7,vs5,vs6
+ xsadddp vs4,vs4,vs7
+
+ /**** store the single result word*/
+#if defined(TRMMKERNEL)
+ xsmuldp vs36,vs4, vs16
+
+#else
+ xsmaddadp vs36,vs4, vs16
+#endif
+
+ stxssp v4, 0(CO)
+
+ addi CO,CO,4
+
+.endm
+
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 3
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 2
+ .endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*16;
+// ptrbb = bb + off*2;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+16; // number of values in A
+// #else
+// temp = off+2; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 16; // number of values in A
+// #else
+// temp -= 2; // number of values in B
+// #endif
+// ptrba += temp*16;
+// ptrbb += temp*2;
+// #endif
+
+// #ifdef LEFT
+// off += 16; // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub 
\TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /*temp -= C_A; // number of values in A*/
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /*temp -= C_B; // number of values in B*/
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B,T2
+
+ #endif
+
+ #ifdef LEFT
+ /*off += C_A; // number of values in A*/
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm
\ No newline at end of file
diff --git a/param.h b/param.h
index 938a82a9e..d59cb1656 100644
--- a/param.h
+++ b/param.h
@@ -2248,12 +2248,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P 1280
+#define SGEMM_DEFAULT_P 640
 #define DGEMM_DEFAULT_P 128
 #define CGEMM_DEFAULT_P 640
 #define ZGEMM_DEFAULT_P 320
 
-#define SGEMM_DEFAULT_Q 640
+#define SGEMM_DEFAULT_Q 1408
 #define DGEMM_DEFAULT_Q 384
 #define CGEMM_DEFAULT_Q 640
 #define ZGEMM_DEFAULT_Q 640

From 9763f872fcb841a00926f31c801bfd007a5337b0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker 
Date: Mon, 29 Apr 2019 19:18:26 +0200
Subject: [PATCH 543/935] Update Changelog with changes from 0.3.6

---
 Changelog.txt | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index 49b26873a..8df35d5c3 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,82 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.6
+29-Apr-2019
+
+common:
+ * the build tools now check that a given cpu TARGET is actually valid
+ * the build-time check of system features (c_check) has been made
+ less dependent on particular perl features (this should mainly
+ benefit building on Windows)
+ * several problems with the ReLAPACK integration were fixed,
+ including INTERFACE64 support and building a shared library
+ * building with CMAKE on BSD systems was improved
+ * a non-absolute SUM function was added based on the
+ existing optimized code for ASUM
+ * CBLAS interfaces to the IxMIN and IxMAX functions were added
+ * a name clash between LAPACKE and BOOST headers was resolved
+ * CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
+ kernels
+ * a crash on thread (key) deletion with the USE_TLS=1 memory management
+ option was fixed
+ * restored several earlier fixes, in particular for OpenMP performance,
+ building on BSD, and calling fork on CYGWIN, which had inadvertently
+ been dropped in the 0.3.3 rewrite of the memory management code.
+
+x86_64:
+ * the AVX512 DGEMM kernel has been disabled again due to unsolved problems
+ * building with old versions of MSVC was fixed
+ * it is now possible to build a static library on Windows with CMAKE
+ * accessing environment variables on CYGWIN at run time was fixed
+ * the CMAKE build system now recognizes 32bit userspace on 64bit hardware
+ * Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
+ * building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
+ with CMAKE as well
+ * building for DYNAMIC_ARCH with GENERIC as the default target is now supported
+ * a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
+ * assembly bugs involving undeclared modification of input operands were fixed
+ in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
+ Sandybridge, Haswell, Bulldozer and Piledriver. 
These would typically cause + test failures or segfaults when compiled with recent versions of gcc from 8 onward. + * a similar bug was fixed in the blas_quickdivide code used to split workloads + in most functions + * a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX + * fixed building on SkylakeX systems when either the compiler or the (emulated) operating + environment does not support AVX512 + * improved GEMM performance on ZEN targets + +x86: + * build failures caused by the recently added checks for AVX512 were fixed + * an inline assembly bug involving undeclared modification of an input argument was + fixed in the blas_quickdivide code used to split workloads in most functions + * a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX + +MIPS32: + * a bug in the IMIN implementation made it return the result of IMAX + +POWER: + * single precision BLAS1/2 functions have received optimized POWER8 kernels + * POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel + * building on PPC970 systems under OSX Leopard or Tiger is now supported + * out-of-bounds memory accesses in the gemm_beta microkernels were fixed + * building a shared library on AIX is now supported for POWER6 + * DYNAMIC_ARCH support has been added for POWER6 and newer + +ARMv7: + * corrected xDOT behaviour with zero INC_X or INC_Y + * a bug in the IMIN implementation made it return the result of IMAX + +ARMv8: + * added support for HiSilicon TSV110 cpus + * the CMAKE build system now recognizes 32bit userspace on 64bit hardware + * cross-compilation with CMAKE now works again + * a bug in the IMIN implementation made it return the result of IMAX + * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 + +IBM Z: + * optimized microkernels for single precicion BLAS1/2 functions have been added + for both Z13 and Z14 + ==================================================================== Version 0.3.5 31-Dec-2018 From bfeb9c16b0011f4f5f508a6d6df18017ab28f95a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:24:53 +0200 Subject: [PATCH 544/935] Increment version to 0.3.7.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969696179..8900973a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 4f8143b098418487b261653b48b16dc71cc2a259 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:25:32 +0200 Subject: [PATCH 545/935] Increment version to 0.3.7.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 21782a2b9..b46479d03 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.6 +VERSION = 0.3.7.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From daf2fec12db90c02aa74cb13726efd8f9b708312 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Mon, 29 Apr 2019 17:03:56 -0400 Subject: [PATCH 546/935] Misc. 
typo fixes Found via `codespell -q 3 -w -L ith,als,dum,nd,amin,nto,wis,ba -S ./relapack,./kernel,./lapack-netlib` --- Changelog.txt | 14 +++++++------- Makefile.rule | 6 +++--- README.md | 2 +- cmake/kernel.cmake | 2 +- cmake/system.cmake | 2 +- cmake/utils.cmake | 2 +- common_stackalloc.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest/c_cblat1.f | 2 +- ctest/c_dblat1.f | 2 +- ctest/c_sblat1.f | 2 +- ctest/c_zblat1.f | 2 +- driver/others/blas_server.c | 6 +++--- driver/others/blas_server_win32.c | 4 ++-- driver/others/init.c | 2 +- driver/others/memory.c | 2 +- f_check | 2 +- interface/CMakeLists.txt | 2 +- interface/axpy.c | 2 +- interface/zaxpy.c | 2 +- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- test/cblat1.f | 2 +- test/dblat1.f | 2 +- test/sblat1.f | 2 +- test/zblat1.f | 2 +- 37 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..9feacf071 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precicion BLAS1/2 functions have been added + * optimized microkernels for single precision BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputing the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputting the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatiable issues with Clang and Pathscale compilers. + * Fixed compatible issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small imput size & multithreads. + * Work-around #27 the low performance axpy issue with small input size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause zdotu & zdotc failures. Instead, work-around it. 
(Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) diff --git a/Makefile.rule b/Makefile.rule index b46479d03..17815096e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -181,17 +181,17 @@ NO_AFFINITY = 1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this mumber by THREAD_TIMEOUT +# system). Also you can control this number by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 -# Using special device driver for mapping physically contigous memory +# Using special device driver for mapping physically contiguous memory # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 -# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute +# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute # with single thread. (Actually in recent versions this is a factor proportional to the # number of floating point operations necessary for the given problem size, no longer # an individual dimension). You can use this setting to avoid the overhead of multi- diff --git a/README.md b/README.md index 26055c745..76a65b74b 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` #### IBM zEnterprise System diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0ed09e776..9b238f004 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,7 +1,7 @@ # helper functions for the kernel CMakeLists.txt -# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. +# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. macro(SetDefaultL1) set(SAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7fda2adb9..d0f560872 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -283,7 +283,7 @@ endif () set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") -# TODO: nead to convert these Makefiles +# TODO: need to convert these Makefiles # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 28ef65f47..fd93f8a70 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () -# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition +# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition # @param sources_in the source files to build from # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. 
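
The Makefile.rule hunk above leaves the surrounding THREAD_TIMEOUT comment intact; as a quick sanity check of the "about 25ms" figure quoted there, the arithmetic can be reproduced with a few lines of C. This program is a standalone illustration, not part of the patch:

#include <stdio.h>

int main(void)
{
    /* THREAD_TIMEOUT=n keeps an idle server thread spinning for
       (1 << n) cycles before it goes to sleep; with n = 26 on a
       3.0 GHz clock that is 2^26 / 3e9 seconds, roughly 22 ms,
       which the comment rounds to "about 25ms". */
    const double cycles = (double)(1UL << 26);
    const double clock_hz = 3.0e9;
    printf("timeout ~ %.1f ms\n", cycles / clock_hz * 1e3);
    return 0;
}
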
diff --git a/common_stackalloc.h b/common_stackalloc.h index ec0fa1611..d3d54669c 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * SIZE must be carefully chosen to be: * - as small as possible to maximize the number of stack allocation * - large enough to support all architectures and kernel - * Chosing a too small SIZE will lead to a stack smashing. + * Choosing a SIZE too small will lead to a stack smashing. */ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ /* make it volatile because some function (ex: dgemv_n.S) */ \ diff --git a/common_x86.h b/common_x86.h index 3fdffe2a8..99adc9f5b 100644 --- a/common_x86.h +++ b/common_x86.h @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 718a81050..f59ff6627 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index c741ce506..1a123d74d 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index c570a9140..4a71b4dcf 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 773787d6f..89902f12d 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index 03753e782..cd0c8541d 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
* diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e5db1804f..6f4e20610 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* jobs is queued. */ -/* We need this grobal for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; /* Local Variables */ @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); #ifdef MONITOR -/* Monitor is a function to see thread's status for every seconds. */ -/* Usually it turns off and it's for debugging. */ +/* Monitor is a function to see thread's status for every second. */ +/* Usually it turns off and it's for debugging. */ static pthread_t monitor_thread; static int main_status[MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 0b38ee365..bace54a23 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,7 +50,7 @@ /* This is a thread implementation for Win32 lazy implementation */ -/* Thread server common infomation */ +/* Thread server common information */ typedef struct{ CRITICAL_SECTION lock; HANDLE filled; @@ -61,7 +61,7 @@ typedef struct{ } blas_pool_t; -/* We need this global for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; /* Local Variables */ diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..0aad9c407 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..3fe31168d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); } #endif diff --git a/f_check b/f_check index 34caa00be..b05db85bd 100644 --- a/f_check +++ b/f_check @@ -125,7 +125,7 @@ if ($compiler eq "") { $openmp = "-openmp"; } - # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. 
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; if ($data =~ / zho_ge__/) { $need2bu = 1; diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index f76d5c13f..5ea39f864 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES axpby.c ) -# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c diff --git a/interface/axpy.c b/interface/axpy.c index 9032946d2..eaa19f4df 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/interface/zaxpy.c b/interface/zaxpy.c index dbd559628..da3b48ead 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ff3c5268d..ada701d70 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index 340234270..ffc4766d2 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index f9d3b445a..9cd1d17ad 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index da340774e..621489085 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index e8f6eb412..492f9fd46 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 0619d3eca..79b2eb806 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index 353e63ee8..f21e5aa8b 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index 1e93b843a..d97a695f5 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 249aff275..7614dcd32 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index 8df5609ad..c8487cf7c 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 7e52ef74e..5dc03bac9 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 9e4f85380..5f52622e2 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/test/cblat1.f b/test/cblat1.f index a4c996fda..d6b53d105 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/dblat1.f b/test/dblat1.f index f3255fef4..28af121cd 100644 --- a/test/dblat1.f +++ b/test/dblat1.f @@ -991,7 +991,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/sblat1.f b/test/sblat1.f index a5c1c6af6..fe05bbe87 100644 --- a/test/sblat1.f +++ b/test/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/zblat1.f b/test/zblat1.f index e2415e1c4..8b4b8d21e 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
* From b43c8382c885551b0f230c8493e79bf04d94e366 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 May 2019 10:46:46 +0200 Subject: [PATCH 547/935] Correct argument of CPU_ISSET for glibc <2.5 fixes #2104 --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..db14cde02 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -229,7 +229,7 @@ int get_num_procs(void) { n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i Date: Wed, 1 May 2019 19:36:22 +0000 Subject: [PATCH 548/935] conflict resolve --- kernel/power/KERNEL.POWER9 | 10 +++++----- kernel/power/icamax.c | 2 +- kernel/power/icamin.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 6d5cf9068..0e0d62393 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -12,11 +12,11 @@ SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power9.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index 06fc5d8ad..bd74d20e5 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; #if defined(USE_MASK_PERMUTATIONS) register __vector unsigned int static_index0 = {0,1,2,3}; #else diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 36432c993..336766245 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; register __vector unsigned int static_index0 = {0,1,2,3}; register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} From 858e609e1feba715065a65034eef02c9516aa107 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:01:29 -0400 Subject: [PATCH 549/935] Revert reference/ fixes --- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ada701d70..ff3c5268d 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index ffc4766d2..340234270 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index 9cd1d17ad..f9d3b445a 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index 621489085..da340774e 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index 492f9fd46..e8f6eb412 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 79b2eb806..0619d3eca 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index f21e5aa8b..353e63ee8 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index d97a695f5..1e93b843a 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 7614dcd32..249aff275 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index c8487cf7c..8df5609ad 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 5dc03bac9..7e52ef74e 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 5f52622e2..9e4f85380 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of From b46875b76b8d4ebbc320547c20f7f4486fe52563 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:43:17 -0400 Subject: [PATCH 550/935] Revert Changelog.txt typos --- Changelog.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 9feacf071..8df35d5c3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precision BLAS1/2 functions have been added + * optimized microkernels for single precicion BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputting the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputing the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatible issues with Clang and Pathscale compilers. + * Fixed compatiable issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small input size & multithreads. 
+ * Work-around #27 the low performance axpy issue with small imput size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) From 7ed8431527eb00f161de4dd309fd4d2b6c885b0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 May 2019 22:54:41 +0200 Subject: [PATCH 551/935] Disable the SkyLakeX DGEMMITCOPY kernel as well as a stopgap measure for https://github.com/numpy/numpy/issues/13401 as mentioned in #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 5d0a300b5..3c678904d 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,7 +10,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c DGEMMINCOPY = dgemm_ncopy_8_skylakex.c -DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From b1561ecc6864428baa4f1336d47d23729b9636f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 May 2019 15:52:01 +0200 Subject: [PATCH 552/935] Disable DGEMMINCOPY as well for now #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3c678904d..d61c51628 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -9,7 +9,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c #DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From 5a9cce2bf6740110b93a534f876072f220d928d1 Mon Sep 17 00:00:00 2001 From: Fabrice Fontaine Date: Sun, 5 May 2019 18:37:28 +0200 Subject: [PATCH 553/935] Makefile.arm: remove -march flags The provided -march flags, especially for ARMv5 and ARMv6 may not necessarily match the needed ones: for ARMv5, it might be armv5, armv5te, armv5t, etc. If the wrong one is used, the incorrect toolchain sysroot can be used in a multilib toolchain. Therefore, let the user building OpenBLAS pass the appropriate -march flag. The other flags, such as -mfpu=vfp or -mfloat-abi=hard are kept, as they are actually required for the build to proceed (OpenBLAS uses VFP instructions, and assume an EABIhf ABI). 
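For illustration, a build invocation might then carry the architecture flag itself. This is a hypothetical sketch, not part of the patch: the armv5te value and the embedding of the flag in CC are assumptions that depend on the toolchain at hand, only CC, HOSTCC, TARGET and NOFORTRAN are existing OpenBLAS make variables:

    # the cross compiler, not Makefile.arm, now decides the exact -march
    # (armv5te is only an example value for a multilib toolchain)
    make CC="arm-linux-gnueabi-gcc -march=armv5te" HOSTCC=gcc TARGET=ARMV5 NOFORTRAN=1

Folding the flag into CC keeps it on every compile line without reintroducing a guess into Makefile.arm, which is the point of the change.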
[Peter: update for v0.2.20] Signed-off-by: Thomas Petazzoni Signed-off-by: Peter Korsgaard [Retrieved from: https://git.buildroot.net/buildroot/tree/package/openblas/0001-Makefile.arm-remove-march-flags.patch] Signed-off-by: Fabrice Fontaine --- Makefile.arm | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index eedd39b73..b5d80f8e6 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a @@ -9,11 +9,6 @@ endif endif ifeq ($(CORE), ARMV6) -CCOMMON_OPT += -mfpu=vfp -march=armv6 -FCOMMON_OPT += -mfpu=vfp -march=armv6 -endif - -ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -march=armv5 -FCOMMON_OPT += -march=armv5 +CCOMMON_OPT += -mfpu=vfp +FCOMMON_OPT += -mfpu=vfp endif From a6a8cc2b7fa30f46fdaa4fb6e50c19da8c11e335 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 May 2019 13:34:52 +0200 Subject: [PATCH 555/935] Fix errors in cpu enumeration with glibc 2.6 for #2114 --- driver/others/init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..a29dce971 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; @@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { + +#if defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 7) cpu_set_t *cpusetp; +#else + cpu_set_t cpuset; +#endif +#endif int nums; int ret; @@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) { } CPU_FREE(cpusetp); #else - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); if (ret!=0) { common->num_procs = nums; } else { @@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) { int i; int n = 0; for (i=0;inum_procs = n; } #else - common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); + common->num_procs = CPU_COUNT(&cpuset); } #endif From c516209581a77790b8d67d6dcd0c3f95fe713643 Mon Sep 17 00:00:00 2001 From: Diazonium Date: Tue, 7 May 2019 14:55:20 +0200 Subject: [PATCH 556/935] Change two http links to https Closes #2109 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76a65b74b..620e393f1 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documentation on the OpenBLAS wiki pages: . +Please read the documentation on the OpenBLAS wiki pages: . ## Binary Packages @@ -22,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source -Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code using Git from https://github.com/xianyi/OpenBLAS.git. 
### Dependencies From 575a84398a1569738029594372f9143a6743c52c Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 7 May 2019 23:46:54 +0300 Subject: [PATCH 557/935] remove redundant code #2113 --- lapack/getrf/getrf_parallel.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 591ce4a99..c82defcab 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -279,9 +279,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) #if 1 { - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; @@ -368,9 +365,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if ((current != mypos) && (!is)) { #if 1 - LOCK_COMMAND(&getrf_lock); - jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; @@ -402,9 +396,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { #if 1 - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; From 7d1b468d9d83789d25eb6996afb5e358ee861f1d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 8 May 2019 09:58:01 +0800 Subject: [PATCH 558/935] Set up CI with Azure Pipelines [skip ci] --- azure-pipelines.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..aa912913d --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,19 @@ +# Starter pipeline +# Start with a minimal pipeline that you can customize to build and deploy your code. +# Add steps that build, run tests, deploy, and more: +# https://aka.ms/yaml + +trigger: +- master + +pool: + vmImage: 'ubuntu-latest' + +steps: +- script: echo Hello, world! + displayName: 'Run a one-line script' + +- script: | + echo Add other tasks to build, test, and deploy your project. 
+ echo See https://aka.ms/yaml + displayName: 'Run a multi-line script' From e47b63466b26dab9618443fd5754885bea653845 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Tue, 7 May 2019 16:06:42 -0700 Subject: [PATCH 559/935] TST: add native POWER8 to CI * add native POWER8 testing to Travis CI matrix with ppc64le os entry --- .travis.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.travis.yml b/.travis.yml index eee7674fe..00a2509f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,15 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 70cea0b96b70330ae6ef80b954e708d6acd86911 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 12:20:00 +0200 Subject: [PATCH 560/935] Update link to IBM MASS library, update cpu support status --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 620e393f1..68a121498 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,7 @@ A debug version can be built using `make DEBUG=1`. ### Compile with MASS support on Power CPU (optional) -The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library -consists of a set of mathematical functions for C, C++, and Fortran applications that are -are tuned for optimum performance on POWER architectures. +The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. The library can be installed as shown: @@ -115,6 +113,7 @@ Please read `GotoBLAS_01Readme.txt`. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. +- **AMD ZEN**: Uses Haswell codes with some optimizations. #### MIPS64 @@ -133,11 +132,13 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` +- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. #### IBM zEnterprise System - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) +- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) ### Supported OS From 3a49e8c05aa24bba832e5e05bd8888fbee039919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:52:22 +0200 Subject: [PATCH 561/935] first try migrating one of the arm builds from travis --- azure-pipelines.yml | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index aa912913d..87b4de3f0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,6 +14,26 @@ steps: displayName: 'Run a one-line script' - script: | - echo Add other tasks to build, test, and deploy your project. 
- echo See https://aka.ms/yaml - displayName: 'Run a multi-line script' + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ + condition: not(startsWith(variables['CONFIG'], 'linux_64')) + displayName: Configure binfmt_misc + +- script: | + echo "FROM openblas/alpine:arm32 + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV6 \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . + +#- script: | +# echo Add other tasks to build, test, and deploy your project. +# echo See https://aka.ms/yaml +# displayName: 'Run a multi-line script' From 5cf434167ab9622c6788e4fdc9b418ab7bf96e61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:58:59 +0200 Subject: [PATCH 562/935] fix tabbing in azure commands --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 87b4de3f0..3b277073a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,10 +14,10 @@ steps: displayName: 'Run a one-line script' - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: Configure binfmt_misc + displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - + displayname: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. # echo See https://aka.ms/yaml From aa4c41bad26bbb6d550ddad3141063c2260b7afd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:12:02 +0200 Subject: [PATCH 563/935] Update azure-pipelines.yml take out offending lines (although stolen from https://github.com/conda-forge/opencv-feedstock azure-pipelines fiie) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3b277073a..d7e6cdc9b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -15,9 +15,9 @@ steps: - script: | docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ - condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: 'Configure binfmt_misc' +# ls /proc/sys/fs/binfmt_misc/ +# condition: not(startsWith(variables['CONFIG'], 'linux_64')) +# displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 From 16fd8e3dbe510802860f1981321bf9cd70676de4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:14:22 +0200 Subject: [PATCH 564/935] Update azure-pipelines.yml --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d7e6cdc9b..12ea40b61 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayname: 'Run ARMV6 docker build' + displayName: 'Run ARMV6 docker build' + #- script: | # echo Add other tasks to build, test, and deploy your project. 
# echo See https://aka.ms/yaml From a598ab1d32c1d5fcf9b9eb0c503a24db13757bc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:23:54 +0200 Subject: [PATCH 565/935] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 12ea40b61..2b092c256 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayName: 'Run ARMV6 docker build' +# displayName: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. From dd77a3f0e27dee0c15b6e1da3649aba6723631ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:25:43 +0200 Subject: [PATCH 566/935] Update azure-pipelines.yml --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2b092c256..e25f11cb1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,6 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . + + # displayName: 'Run ARMV6 docker build' #- script: | From ad20ceaa680e555e6f4e5e6d199f4c158ef1b6df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 19:07:58 +0200 Subject: [PATCH 567/935] Update azure-pipelines.yml --- azure-pipelines.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e25f11cb1..0b1ba16fd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,14 +13,14 @@ steps: - script: echo Hello, world! displayName: 'Run a one-line script' -- script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset +#- script: | +# docker run --rm --privileged multiarch/qemu-user-static:register --reset # ls /proc/sys/fs/binfmt_misc/ # condition: not(startsWith(variables['CONFIG'], 'linux_64')) # displayName: 'Configure binfmt_misc' - script: | - echo "FROM openblas/alpine:arm32 + echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ @@ -31,10 +31,8 @@ steps: -D BUILD_WITHOUT_CBLAS=ON \ -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile - docker build . - - -# displayName: 'Run ARMV6 docker build' + docker build . + displayName: Run ARMV6 docker build #- script: | # echo Add other tasks to build, test, and deploy your project. From 53703585aa5ac170cabfe035a32bd0e07e1877c8 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Wed, 8 May 2019 15:14:01 -0700 Subject: [PATCH 568/935] DOC: Add Azure CI status badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 68a121498..14815ff00 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) +[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) + ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. 
From 406c7242f49730e45453544b601d717e02ebe07d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 00:47:44 +0200 Subject: [PATCH 569/935] Add ARMV6 build to azure CI setup (#2122) using aytekinar's Alpine image and docker script from the Travis setup [skip ci] --- azure-pipelines.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0b1ba16fd..cef2ef973 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,19 +13,15 @@ steps: - script: echo Hello, world! displayName: 'Run a one-line script' -#- script: | -# docker run --rm --privileged multiarch/qemu-user-static:register --reset -# ls /proc/sys/fs/binfmt_misc/ -# condition: not(startsWith(variables['CONFIG'], 'linux_64')) -# displayName: 'Configure binfmt_misc' - - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV6 \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV6 \ + -D NOFORTRAN=ON \ -D BUILD_SHARED_LIBS=ON \ -D BUILD_WITHOUT_LAPACK=ON \ -D BUILD_WITHOUT_CBLAS=ON \ From 4efbac28ed42b79ac0ba27cfe065d38a3ba5af68 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Wed, 8 May 2019 18:51:59 -0700 Subject: [PATCH 570/935] TST: Azure manylinux1 & clean-up * remove some of the steps & comments from the original Azure yml template * modify the trigger section to use develop since OpenBLAS primarily uses this branch; use the same batching behavior as downstream projects NumPy/ SciPy * remove Travis emulated ARMv6 gcc build because this now happens in Azure * use documented Ubuntu vmImage name for Azure and add in a manylinux1 test run to the matrix [skip appveyor] --- .travis.yml | 8 ++----- azure-pipelines.yml | 57 +++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index 00a2509f9..82e2aaac8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -177,8 +177,8 @@ matrix: dist: trusty sudo: required services: docker - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc - name: "Emulated Build for ARMV6 with gcc" + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + name: "Emulated Build for ARMV6 with clang" before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset script: | echo "FROM openblas/alpine:${IMAGE_ARCH} @@ -193,9 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - name: "Emulated Build for ARMV6 with clang" - <<: *emulated-arm env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc name: "Emulated Build for ARMV8 with gcc" @@ -204,7 +201,6 @@ matrix: name: "Emulated Build for ARMV8 with clang" allow_failures: - - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cef2ef973..cbea6f4a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,21 +1,18 @@ -# Starter pipeline -# Start with a minimal pipeline that you can customize to build and deploy your code. 
-# Add steps that build, run tests, deploy, and more: -# https://aka.ms/yaml - trigger: -- master - -pool: - vmImage: 'ubuntu-latest' - -steps: -- script: echo Hello, world! - displayName: 'Run a one-line script' + # start a new build for every push + batch: False + branches: + include: + - develop -- script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm32 +jobs: +- job: ARMv6_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset + echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ @@ -27,10 +24,24 @@ steps: -D BUILD_WITHOUT_CBLAS=ON \ -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile - docker build . - displayName: Run ARMV6 docker build - -#- script: | -# echo Add other tasks to build, test, and deploy your project. -# echo See https://aka.ms/yaml -# displayName: 'Run a multi-line script' + docker build . + displayName: Run ARMV6 docker build +# manylinux1 is useful to test because the +# standard Docker container uses an old version +# of gcc / glibc +- job: manylinux1_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + echo "FROM quay.io/pypa/manylinux1_x86_64 + COPY . /tmp/openblas + RUN cd /tmp/openblas && \ + COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ + BTYPE='BINARY=64' CC=gcc && \ + make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ + make -C test $COMMON_FLAGS $BTYPE && \ + make -C ctest $COMMON_FLAGS $BTYPE && \ + make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile + docker build . + displayName: Run manylinux1 docker build From a3d4c65d62cf3689fe5840e65a7fcdb64d986435 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 11:52:02 +0200 Subject: [PATCH 571/935] Add NO_AFFINITY to available options on Linux, and set it to ON to match the gmake default. 
Fixes second part of #2114 --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a27c1c0fc..50da721cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6.dev) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -20,9 +20,14 @@ if(MSVC) option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) -option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) -option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) +option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") +option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) +else() +set(NO_AFFINITY 1) +endif() # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using From 9ea30f3788b64b7f42acfaf08e234591aee33e23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 14:42:36 +0200 Subject: [PATCH 572/935] Replace ISMIN and ISAMIN kernels on all x86_64 platforms (#2125) * Mark iamax_sse.S as unsuitable for MIN due to issue #2116 * Use iamax.S rather than iamax_sse.S for ISMIN/ISAMIN on all x86_64 as workaround for #2116 --- kernel/x86_64/KERNEL | 4 +- kernel/x86_64/iamax_sse.S | 106 ++++++++++++++++++++------------------ 2 files changed, 58 insertions(+), 52 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4874711bb..92d121ab2 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S endif ifndef ISAMINKERNEL -ISAMINKERNEL = iamax_sse.S +ISAMINKERNEL = iamax.S endif ifndef IDAMINKERNEL @@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S endif ifndef ISMINKERNEL -ISMINKERNEL = iamax_sse.S +ISMINKERNEL = iamax.S endif ifndef IDMINKERNEL diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index f22e34a1d..d50c1699c 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -36,6 +36,10 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +/* This kernel was found to give wrong results when used for ISMIN/ISAMIN + with increment != 1, although it appears to be correct for corresponding + MAX operations. 
See issue 2116 */ + #define ASSEMBLER #include "common.h" @@ -48,9 +52,11 @@ #define XX %r10 #define MM %r11 +#define MAXPS maxps +#define MAXSS maxss #ifdef USE_MIN -#define maxps minps -#define maxss minss +#define MAXPS minps +#define MAXSS minss #endif #include "l1param.h" @@ -103,7 +109,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 decq M addq $SIZE, X ALIGN_3 @@ -117,7 +123,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm1 + MAXPS %xmm4, %xmm1 subq $2, M addq $2 * SIZE, X ALIGN_3 @@ -137,25 +143,25 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 movaps 8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 movaps 12 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -173,13 +179,13 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -191,7 +197,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -204,7 +210,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $2 * SIZE, X .L18: @@ -215,22 +221,22 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 ALIGN_3 .L20: movq XX, X movq MM, M - maxps %xmm1, %xmm0 - maxps %xmm3, %xmm2 - maxps %xmm2, %xmm0 + MAXPS %xmm1, %xmm0 + MAXPS %xmm3, %xmm2 + MAXPS %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - maxps %xmm1, %xmm0 + MAXPS %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - maxss %xmm1, %xmm0 + MAXSS %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 testq $4, X @@ -427,28 +433,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 movsd 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 movsd 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -467,14 +473,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -488,7 +494,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -501,7 +507,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $2 * SIZE, X .L38: @@ -512,7 +518,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 jmp .L40 ALIGN_4 @@ -520,15 +526,15 @@ movq XX, X movq MM, M - maxps %xmm1, %xmm0 - maxps %xmm3, %xmm2 - maxps %xmm2, %xmm0 + MAXPS %xmm1, %xmm0 + MAXPS %xmm3, %xmm2 + MAXPS %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - maxps %xmm1, %xmm0 + MAXPS %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - maxss %xmm1, %xmm0 + MAXSS %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I @@ 
-687,56 +693,56 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 decq I jg .L81 @@ -754,28 +760,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 ALIGN_3 .L86: @@ -787,14 +793,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 ALIGN_3 .L87: @@ -806,16 +812,16 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 ALIGN_4 .L90: movq XX, X movq MM, M - maxss %xmm1, %xmm0 - maxss %xmm3, %xmm2 - maxss %xmm2, %xmm0 + MAXSS %xmm1, %xmm0 + MAXSS %xmm3, %xmm2 + MAXSS %xmm2, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I From 3cb1c8d210046f4f6e2935fe796af3648387a38e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 16:07:30 +0200 Subject: [PATCH 573/935] Move ARMv8 gcc build from Travis to Azure --- azure-pipelines.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cbea6f4a7..4673d07fe 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,6 +26,26 @@ jobs: cmake --build ." > Dockerfile docker build . displayName: Run ARMV6 docker build +- job: ARMv8_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset + echo "FROM openblas/alpine:arm64 + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV8 \ + -D NOFORTRAN=ON \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . 
+ displayName: Run ARMV8 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From 999a04f101250a189c92919277db6cbc50a584ff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 16:08:23 +0200 Subject: [PATCH 574/935] Move ARMv8 gcc build from Travis to Azure --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 82e2aaac8..eb74ded37 100644 --- a/.travis.yml +++ b/.travis.yml @@ -202,7 +202,6 @@ matrix: allow_failures: - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang # whitelist From 43068288e9fc035cd9ebc7254de7a5f0a3600090 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 22:37:06 +0200 Subject: [PATCH 575/935] Update .travis.yml --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index eb74ded37..b2827997c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -193,9 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - name: "Emulated Build for ARMV8 with gcc" - <<: *emulated-arm env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang name: "Emulated Build for ARMV8 with clang" From d86f0b9e74130ab659062bca40badc1dc36649f0 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:35:07 -0500 Subject: [PATCH 576/935] Test drone CI --- .drone.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .drone.yml diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 000000000..b2300b81d --- /dev/null +++ b/.drone.yml @@ -0,0 +1,19 @@ +--- +kind: pipeline +name: arm64_gcc + +platform: + os: linux + arch: arm64 + +steps: +- name: Build + image: centos:7 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS" From 58829c098841d2da28defa96538a7a2f9d3e0f21 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:39:51 -0500 Subject: [PATCH 577/935] install make --- .drone.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index b2300b81d..75868e919 100644 --- a/.drone.yml +++ b/.drone.yml @@ -7,12 +7,13 @@ platform: arch: arm64 steps: -- name: Build +- name: Build and Test image: centos:7 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - sudo yum -y install make - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From ff807473bb6e0faf8e7767c18b5cfae1318e0aaa Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:40:23 -0500 Subject: [PATCH 578/935] remove sudo --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 75868e919..da9520975 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - sudo yum -y install make + - yum -y install make - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 21acf03e9a2b21e39fa6e81899f100084de0ba93 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:42:16 -0500 
Subject: [PATCH 579/935] Install gcc --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index da9520975..c4f216ed6 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make + - yum -y install make gcc - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 15f925fe9a0ca823352fd252cad2da95c810cec4 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:44:15 -0500 Subject: [PATCH 580/935] Install perl --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index c4f216ed6..765c2b02c 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc + - yum -y install make gcc perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From a0aaf308ed682d58962f1dd6f568647e97572596 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:47:49 -0500 Subject: [PATCH 581/935] Install gfortran and add a clang job --- .drone.yml | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 765c2b02c..3b1515c33 100644 --- a/.drone.yml +++ b/.drone.yml @@ -1,6 +1,6 @@ --- kind: pipeline -name: arm64_gcc +name: arm64_gcc_make platform: os: linux @@ -13,7 +13,28 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc perl + - yum -y install make gcc gfortran perl + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS" + +--- +kind: pipeline +name: arm64_clang_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: centos:7 + environment: + CC: clang + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - yum -y install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 9184590c33e9b8df68460877a0d56e229d21d2ce Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:50:37 -0500 Subject: [PATCH 582/935] gfortran->gcc-gfortran --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 3b1515c33..37ca7478f 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc gfortran perl + - yum -y install make gcc gcc-gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -34,7 +34,7 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc gfortran perl clang + - yum -y install make gcc gcc-gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From dc110e179d5110bb807ee9c962e9b7da938ac9a6 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:53:58 -0500 Subject: [PATCH 583/935] Switch to ubuntu and parallel jobs --- .drone.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.drone.yml b/.drone.yml index 37ca7478f..f048cad1f 100644 --- a/.drone.yml +++ b/.drone.yml @@ -8,12 +8,12 
@@ platform: steps: - name: Build and Test - image: centos:7 + image: ubuntu:18.04 environment: CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - yum -y install make gcc gcc-gfortran perl + - apt install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -29,12 +29,12 @@ platform: steps: - name: Build and Test - image: centos:7 + image: ubuntu:18.04 environment: CC: clang - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - yum -y install make gcc gcc-gfortran perl clang + - apt install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 612c2d78e0589634de791c72769c978c2fdc0141 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:55:04 -0500 Subject: [PATCH 584/935] apt update --- .drone.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index f048cad1f..973e00c00 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,8 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt install make gcc gfortran perl clang + - apt-get update + - apt-get install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -34,7 +35,8 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt install make gcc gfortran perl clang + - apt-get update + - apt-get install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 231472c4c6c5e4b76000e62b1ad8b0a0b25c6ed4 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:55:38 -0500 Subject: [PATCH 585/935] Fix typo --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 973e00c00..5fe9983ae 100644 --- a/.drone.yml +++ b/.drone.yml @@ -18,7 +18,7 @@ steps: - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS" + - make -C utest $COMMON_FLAGS --- kind: pipeline @@ -40,4 +40,4 @@ steps: - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS" + - make -C utest $COMMON_FLAGS From 608cd69b66059de14b29639ab29957c99190be5c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:56:59 -0500 Subject: [PATCH 586/935] update yes --- .drone.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index 5fe9983ae..6413bd1c9 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,8 +13,8 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt-get update - - apt-get install make gcc gfortran perl clang + - apt-get update -y + - apt-get install -y make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -35,8 +35,8 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt-get update - - apt-get install make gcc gfortran perl clang + - apt-get update -y + - apt-get install -y make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test 
$COMMON_FLAGS - make -C ctest $COMMON_FLAGS From d40c109eb0ace38d967e221308496854d207a70f Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:02:39 -0500 Subject: [PATCH 587/935] no need of gcc in clang build --- .drone.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index 6413bd1c9..0d7fd2000 100644 --- a/.drone.yml +++ b/.drone.yml @@ -11,10 +11,10 @@ steps: image: ubuntu:18.04 environment: CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - apt-get update -y - - apt-get install -y make gcc gfortran perl clang + - apt-get install -y make $CC gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -33,10 +33,10 @@ steps: image: ubuntu:18.04 environment: CC: clang - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - apt-get update -y - - apt-get install -y make gcc gfortran perl clang + - apt-get install -y make $CC gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From dadafcdcd84ffa8f5545a14aa3b2c0b39398195c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:06:04 -0500 Subject: [PATCH 588/935] Add a cmake build as well --- .drone.yml | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/.drone.yml b/.drone.yml index 0d7fd2000..70bfc5f19 100644 --- a/.drone.yml +++ b/.drone.yml @@ -41,3 +41,47 @@ steps: - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm64_gcc_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + commands: + - apt-get update -y + - apt-get install -y make $CC gfortran perl cmake + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . + - ctest + +--- +kind: pipeline +name: arm64_clang_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + commands: + - apt-get update -y + - apt-get install -y make $CC gfortran perl cmake + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . 
+ - ctest From cd99dfe034e3df8ad850dbae96e2e1deac6fdc2d Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:09:29 -0500 Subject: [PATCH 589/935] Add cmake builds and print options --- .drone.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.drone.yml b/.drone.yml index 70bfc5f19..a7224036e 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,8 +13,10 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl + - $CC --version - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -35,8 +37,10 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl + - $CC --version - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -57,8 +61,10 @@ steps: CC: gcc CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl cmake + - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - cmake --build . @@ -79,8 +85,10 @@ steps: CC: clang CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl cmake + - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - cmake --build . From 3d94ab660f6352d31ef4a92835fd7506869cb80d Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:17:12 -0500 Subject: [PATCH 590/935] build without lapack on cmake --- .drone.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index a7224036e..3df5f406a 100644 --- a/.drone.yml +++ b/.drone.yml @@ -59,11 +59,12 @@ steps: image: ubuntu:18.04 environment: CC: gcc - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + CXX: g++ + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC gfortran perl cmake + - apt-get install -y make $CC $CXX g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. @@ -83,11 +84,12 @@ steps: image: ubuntu:18.04 environment: CC: clang - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + CXX: clang++ + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC gfortran perl cmake + - apt-get install -y make $CC $CXX perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. 
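For reference, the make-based Drone pipelines being iterated on above reduce to a short build-and-test sequence. The following is only a sketch assembled from the .drone.yml fragments in these patches (container image, compiler and flag values are the ones used there), shown as plain shell for local reproduction:

    # inside an ubuntu:18.04 arm64 container
    apt-get update -y
    apt-get install -y make gcc gfortran perl
    export CC=gcc COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
    make QUIET_MAKE=1 $COMMON_FLAGS    # build the library
    make -C test $COMMON_FLAGS         # BLAS tests
    make -C ctest $COMMON_FLAGS        # CBLAS tests
    make -C utest $COMMON_FLAGS        # unit tests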
From 7aa6faad5f17cbd6e477e0c393a3ae853e610de8 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:22:36 -0500 Subject: [PATCH 591/935] parallel build --- .drone.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.drone.yml b/.drone.yml index 3df5f406a..a8c69f8ca 100644 --- a/.drone.yml +++ b/.drone.yml @@ -59,16 +59,15 @@ steps: image: ubuntu:18.04 environment: CC: gcc - CXX: g++ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC $CXX g++ perl cmake + - apt-get install -y make $CC g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - - cmake --build . + - make -j - ctest --- @@ -84,14 +83,13 @@ steps: image: ubuntu:18.04 environment: CC: clang - CXX: clang++ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC $CXX perl cmake + - apt-get install -y make $CC g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - - cmake --build . + - make -j - ctest From e3cb8ad2d6cef8a56d8a0543d58c678f7b068ecd Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:28:48 -0500 Subject: [PATCH 592/935] See if ubuntu 19.04 fixes the ICE --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index a8c69f8ca..46f259794 100644 --- a/.drone.yml +++ b/.drone.yml @@ -8,7 +8,7 @@ platform: steps: - name: Build and Test - image: ubuntu:18.04 + image: ubuntu:19.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' From 7ff44e0016f1f1bdeb518e108d9ae65e30004233 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:09:53 -0500 Subject: [PATCH 593/935] Remove qemu armv8 builds --- .travis.yml | 7 ------- azure-pipelines.yml | 20 -------------------- 2 files changed, 27 deletions(-) diff --git a/.travis.yml b/.travis.yml index b2827997c..dc388459b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -193,13 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang - name: "Emulated Build for ARMV8 with clang" - - allow_failures: - - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang # whitelist branches: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4673d07fe..cbea6f4a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,26 +26,6 @@ jobs: cmake --build ." > Dockerfile docker build . displayName: Run ARMV6 docker build -- job: ARMv8_gcc - pool: - vmImage: 'ubuntu-16.04' - steps: - - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm64 - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV8 \ - -D NOFORTRAN=ON \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . 
- displayName: Run ARMV8 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From b911525c81063db8b7525800cff2a7d842b99518 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:14:46 -0500 Subject: [PATCH 594/935] arm32 build --- .drone.yml | 48 +++++++++++++++++++++++++++++++++++++++++++++ .travis.yml | 21 -------------------- azure-pipelines.yml | 20 ------------------- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/.drone.yml b/.drone.yml index 46f259794..aa9e129e0 100644 --- a/.drone.yml +++ b/.drone.yml @@ -22,6 +22,30 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS +--- +kind: pipeline +name: arm32_gcc_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + --- kind: pipeline name: arm64_clang_make @@ -46,6 +70,30 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS +--- +kind: pipeline +name: arm32_clang_cmake + +platform: + os: linux + arch: arm + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - make -j + - ctest + --- kind: pipeline name: arm64_gcc_cmake diff --git a/.travis.yml b/.travis.yml index dc388459b..a92bb0687 100644 --- a/.travis.yml +++ b/.travis.yml @@ -173,27 +173,6 @@ matrix: env: - BTYPE="BINARY=32" - - &emulated-arm - dist: trusty - sudo: required - services: docker - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - name: "Emulated Build for ARMV6 with clang" - before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset - script: | - echo "FROM openblas/alpine:${IMAGE_ARCH} - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=${TARGET_ARCH} \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . - # whitelist branches: only: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cbea6f4a7..7197062d1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,26 +6,6 @@ trigger: - develop jobs: -- job: ARMv6_gcc - pool: - vmImage: 'ubuntu-16.04' - steps: - - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm32 - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV6 \ - -D NOFORTRAN=ON \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . 
- displayName: Run ARMV6 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From b43deb4ad60b2960b4c0ee1aca6afeaadc30673c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:25:45 -0500 Subject: [PATCH 595/935] Fix typo --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index aa9e129e0..779912954 100644 --- a/.drone.yml +++ b/.drone.yml @@ -28,11 +28,11 @@ name: arm32_gcc_make platform: os: linux - arch: arm64 + arch: arm steps: - name: Build and Test - image: ubuntu:18.04 + image: ubuntu:19.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' From a211bc9b6a6e597a38fc8b8b7ed0b006cb367c46 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Tue, 14 May 2019 11:32:23 -0700 Subject: [PATCH 596/935] TST: add SkylakeX AVX512 CI test * adapt the C-level reproducer code for some recent SkylakeX AVX512 kernel issues, provided by Isuru Fernando and modified by Martin Kroeker, for usage in the utest suite * add an Intel SDE SkylakeX emulation utest run to the Azure CI matrix; a custom Docker build was required because Ubuntu image provided by Azure does not support AVX512VL instructions --- azure-pipelines.yml | 24 ++++++++++++++++++ utest/CMakeLists.txt | 1 + utest/Makefile | 1 + utest/test_kernel_regress.c | 50 +++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 utest/test_kernel_regress.c diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7197062d1..9b4c85367 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -25,3 +25,27 @@ jobs: make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile docker build . displayName: Run manylinux1 docker build +- job: Intel_SDE_skx + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + # at the time of writing the available Azure Ubuntu vm image + # does not support AVX512VL, so use more recent LTS version + echo "FROM ubuntu:bionic + COPY . /tmp/openblas + RUN apt-get -y update && apt-get -y install \\ + cmake \\ + gfortran \\ + make \\ + wget + RUN mkdir /tmp/SDE && cd /tmp/SDE && \\ + mkdir sde-external-8.35.0-2019-03-11-lin && \\ + wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\ + tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1 + RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64 + CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile + docker build -t intel_sde . 
+ # we need a privileged docker run for sde process attachment + docker run --privileged intel_sde + displayName: 'Run AVX512 SkylakeX docker build / test' diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index dc306501f..4e647cadc 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,6 +38,7 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c + test_kernel_regress.c ) endif() diff --git a/utest/Makefile b/utest/Makefile index 550a65569..cbe639cdb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -13,6 +13,7 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +OBJS += test_kernel_regress.o endif #this does not work with OpenMP nor with native Windows or Android threads diff --git a/utest/test_kernel_regress.c b/utest/test_kernel_regress.c new file mode 100644 index 000000000..93a30b30c --- /dev/null +++ b/utest/test_kernel_regress.c @@ -0,0 +1,50 @@ +#include "openblas_utest.h" +#include <stdio.h> +#include <stdlib.h> +#include <math.h> + +#define LAPACK_ROW_MAJOR 101 +blasint LAPACKE_dgesvd( blasint matrix_layout, char jobu, char jobvt, + blasint m, blasint n, double* a, + blasint lda, double* s, double* u, blasint ldu, + double* vt, blasint ldvt, double* superb ); + + +#define DATASIZE 100 + +double s[DATASIZE]; +double u[DATASIZE*DATASIZE]; +double vt[DATASIZE*DATASIZE]; +double X[DATASIZE*DATASIZE]; +double superb[DATASIZE]; +double tmp[DATASIZE*DATASIZE]; +double m[DATASIZE*DATASIZE]; + +CTEST(kernel_regress,skx_avx) +{ + double norm; + int i, j, info; + srand(0); + for (i = 0; i < DATASIZE*DATASIZE; i++) { + m[i] = (rand()+0.0)/RAND_MAX * 10; + tmp[i] = m[i]; + } + + info = LAPACKE_dgesvd( LAPACK_ROW_MAJOR, 'A', 'A', DATASIZE, DATASIZE, m, DATASIZE, + s, u, DATASIZE, vt, DATASIZE, superb); + + for (i = 0; i < DATASIZE; i++) { + for (j = 0; j < DATASIZE; j++) { + u[i*DATASIZE+j] = u[i*DATASIZE+j]*s[j]; + } + } + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + DATASIZE, DATASIZE, DATASIZE, 1, u, DATASIZE, vt, DATASIZE, 0, X, DATASIZE); + + for (i = 0; i < DATASIZE*DATASIZE; i++) { + X[i] = X[i] - tmp[i]; + } + + norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1); + ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10); +} From d2cb610272137536416df2e44f1bc8175ddd4eaf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:18:43 +0200 Subject: [PATCH 597/935] Add option USE_LOCKING for single-threaded build with locking support for calling from concurrent threads --- Makefile.rule | 10 ++++++++-- Makefile.system | 12 ++++++++++++ common.h | 4 ++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 17815096e..faf8c8013 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -56,7 +56,13 @@ VERSION = 0.3.7.dev # specify it. # For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -# USE_THREAD = 0 +USE_THREAD = 0 + +# If you want to build a single-threaded OpenBLAS, but expect to call this +# from several concurrent threads in some other program, comment this in for +# thread safety. (This is done automatically for USE_THREAD=1 , and should not +# be necessary when USE_OPENMP=1) +# USE_LOCKING = 1 # If you're going to use this library with OpenMP, please comment it in. # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
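The intent of the new USE_LOCKING option is easiest to see on the build command line. A minimal sketch, assuming a plain non-OpenMP toolchain; the flag combination is the one documented in the Makefile.rule comment above:

    # single-threaded OpenBLAS that remains safe to call
    # from several concurrent threads of the host program
    make USE_THREAD=0 USE_LOCKING=1

With USE_THREAD=1 the locking is enabled automatically, and with USE_OPENMP=1 it should not be needed.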
@@ -220,7 +226,7 @@ NO_AFFINITY = 1 COMMON_PROF = -pg # Build Debug version -# DEBUG = 1 +DEBUG = 1 # Set maximum stack allocation. # The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV diff --git a/Makefile.system b/Makefile.system index a95d6190f..29aef7e27 100644 --- a/Makefile.system +++ b/Makefile.system @@ -237,6 +237,10 @@ SMP = 1 endif endif +ifeq ($(SMP), 1) +USE_LOCKING = +endif + ifndef NEED_PIC NEED_PIC = 1 endif @@ -388,6 +392,12 @@ ifneq ($(MAX_STACK_ALLOC), 0) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) endif +ifdef USE_LOCKING +ifneq ($(USE_LOCKING), 0) +CCOMMON_OPT += -DUSE_LOCKING +endif +endif + # # Architecture dependent settings # @@ -744,6 +754,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive +# work around ABI changes in gfortran 9 that break calls from C code +FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran diff --git a/common.h b/common.h index 0ac74bb20..a9fe8d911 100644 --- a/common.h +++ b/common.h @@ -131,7 +131,7 @@ extern "C" { #include #include #include -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #include #endif #endif @@ -200,7 +200,7 @@ extern "C" { #error "You can't specify both LOCK operation!" #endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #define USE_PTHREAD_LOCK #undef USE_PTHREAD_SPINLOCK #endif From 1e52572be38541cc11ac39cef6cded8a640bb65b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:19:30 +0200 Subject: [PATCH 598/935] Add option USE_LOCKING for single-threaded build with locking support --- cmake/system.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index d0f560872..adedd32cc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -136,6 +136,10 @@ endif () if (USE_THREAD) message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") +else() + if (${USE_LOCKING}) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING") + endif () endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") From 86dda5c2fa9e298deacdd17211e2c4e58f2688ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:21:20 +0200 Subject: [PATCH 599/935] Add option USE_LOCKING for SMP-like locking in USE_THREAD=0 builds --- driver/others/memory.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 02352b3ae..adb1ec86c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2062,13 +2062,13 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif } @@ -2214,13 +2214,13 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos 
++; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif } @@ -2701,7 +2701,7 @@ void *blas_memory_alloc(int procpos){ position = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif do { @@ -2718,7 +2718,7 @@ void *blas_memory_alloc(int procpos){ position ++; } while (position < NUM_BUFFERS); -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif goto error; @@ -2730,7 +2730,7 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #else blas_unlock(&memory[position].lock); @@ -2779,11 +2779,11 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2839,7 +2839,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) @@ -2855,7 +2855,7 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2872,7 +2872,7 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif return; @@ -2924,7 +2924,7 @@ void blas_shutdown(void){ #if defined(OS_LINUX) && !defined(NO_WARMUP) -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) @@ -2949,7 +2949,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, if (hot_alloc != 2) { #endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) LOCK_COMMAND(&init_lock); #endif @@ -2959,7 +2959,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size -= PAGESIZE; } -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) UNLOCK_COMMAND(&init_lock); #endif From 5ecffc28f2c32a23222ab633c904c9886923ecf1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:36:17 +0200 Subject: [PATCH 600/935] Add option USE_LOCKING but keep default settings intact --- Makefile.rule | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index faf8c8013..255d1da46 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -56,7 +56,7 @@ VERSION = 0.3.7.dev # specify it. 
# For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -USE_THREAD = 0 +# USE_THREAD = 0 # If you want to build a single-threaded OpenBLAS, but expect to call this # from several concurrent threads in some other program, comment this in for @@ -226,7 +226,7 @@ NO_AFFINITY = 1 COMMON_PROF = -pg # Build Debug version -DEBUG = 1 +# DEBUG = 1 # Set maximum stack allocation. # The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV From f66c11fc22fa01eb8e120d4274d262b3795e4281 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:38:12 +0200 Subject: [PATCH 601/935] Remove unrelated change --- Makefile.system | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 29aef7e27..f574edf88 100644 --- a/Makefile.system +++ b/Makefile.system @@ -754,8 +754,6 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive -# work around ABI changes in gfortran 9 that break calls from C code -FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran From 1778fd4219688e84463844f3aeaf824ca4043b31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 May 2019 13:48:27 +0200 Subject: [PATCH 602/935] Do not try ancient PGI hacks with recent versions of that compiler should fix #2139 --- driver/others/memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 02352b3ae..bf2cfb996 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif @@ -3192,7 +3194,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); - +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -3200,6 +3202,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif From 940f38f6dd504c02a554470b53545270e8e5a351 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 May 2019 13:02:23 +0200 Subject: [PATCH 603/935] Build and run utests in any case, they do their own checks for fortran availability --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 273fde33e..aed248ef2 100644 --- a/Makefile +++ b/Makefile @@ -123,8 +123,8 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all - $(MAKE) -C utest all endif + $(MAKE) -C utest all ifndef NO_CBLAS $(MAKE) -C ctest all endif From 79366ff7a9548e7eb5d200c7ac444d35b28f2b7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 May 2019 20:34:22 +0200 Subject: [PATCH 
604/935] Add softfp support in min/max kernels fix for #1912 --- kernel/arm/iamax_vfp.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index fd43b15b1..ae362935e 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -469,9 +469,11 @@ iamax_kernel_S10: iamax_kernel_L999: - +#if !defined(__ARM_PCS_VFP) + vmov r0, s0 +#else mov r0, INDEX // set return value - +#endif pop {r4} bx lr From d76b20b4d2617582c8e1ac8a5aeb079e5c9de6f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 29 May 2019 14:07:17 +0200 Subject: [PATCH 605/935] Revert "Add softfp support in min/max kernels" --- kernel/arm/iamax_vfp.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index ae362935e..fd43b15b1 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -469,11 +469,9 @@ iamax_kernel_S10: iamax_kernel_L999: -#if !defined(__ARM_PCS_VFP) - vmov r0, s0 -#else + mov r0, INDEX // set return value -#endif + pop {r4} bx lr From c70496b1082983e4d68a2513486a9d2fcbef44e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 29 May 2019 15:02:51 +0200 Subject: [PATCH 606/935] Separate implementations of AMAX and IAMAX on arm As noted in #1912 and comment on #1942, the combined implementation happens to "do the right thing" on hardfp, but cannot return both value and index on softfp where they would have to share the return register --- kernel/arm/KERNEL.ARMV6 | 24 +-- kernel/arm/amax_vfp.S | 441 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 453 insertions(+), 12 deletions(-) create mode 100644 kernel/arm/amax_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b773a5ba0..1c561deb6 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,20 +1,20 @@ include $(KERNELDIR)/KERNEL.ARMV5 -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S +SAMAXKERNEL = amax_vfp.S +DAMAXKERNEL = amax_vfp.S +CAMAXKERNEL = amax_vfp.S +ZAMAXKERNEL = amax_vfp.S -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S +SAMINKERNEL = amax_vfp.S +DAMINKERNEL = amax_vfp.S +CAMINKERNEL = amax_vfp.S +ZAMINKERNEL = amax_vfp.S -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S +SMAXKERNEL = amax_vfp.S +DMAXKERNEL = amax_vfp.S -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S +SMINKERNEL = amax_vfp.S +DMINKERNEL = amax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S new file mode 100644 index 000000000..c780ce5bd --- /dev/null +++ b/kernel/arm/amax_vfp.S @@ -0,0 +1,441 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(USE_ABS) + +#if defined(DOUBLE) + +#define VABS(x0,x1) vabs.f64 x0, x1 + +#else + +#define VABS(x0,x1) vabs.f32 x0, x1 + +#endif + +#else + +#define VABS(x0,x1) nop + +#endif + +/*****************************************************************************************/ + +#if defined(USE_MIN) + +#define MOVCOND movlt + +#if defined(DOUBLE) + +#define VMOVCOND vmovlt.f64 + +#else + +#define VMOVCOND vmovlt.f32 + +#endif + +#else + +#define MOVCOND movgt + +#if defined(DOUBLE) + +#define VMOVCOND vmovgt.f64 + +#else + +#define VMOVCOND vmovgt.f32 + +#endif + + +#endif + + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro INIT_F + + vldmia.f64 X!, { d0 } + VABS( d0, d0 ) + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + +.endm + +.macro INIT_S + + vldmia.f64 X, { d0 } + VABS( d0, d0 ) + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 } + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + vldmia.f32 X!, { s0 } + VABS( s0, s0 ) + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + +.endm + +.macro INIT_S + + vldmia.f32 X, { s0 } + VABS( s0, s0 ) + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 } + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro INIT_F + + vldmia.f64 X!, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 +.endm + + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 - d5 } + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 
+ vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + +.endm + +.macro INIT_S + + vldmia.f64 X, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 - d5 } + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + vldmia.f32 X!, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + +.endm + + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 - s5 } + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + +.endm + +.macro INIT_S + + vldmia.f32 X, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 - s5 } + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + add X, X, INC_X + +.endm + + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + movs r12, #0 // clear floating point register + vmov s0, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 +#endif + + + cmp N, #0 + ble amax_kernel_L999 + + cmp INC_X, #0 + beq amax_kernel_L999 + + + cmp INC_X, #1 + bne amax_kernel_S_BEGIN + + +amax_kernel_F_BEGIN: + + INIT_F + + subs N, N , #1 + ble amax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble amax_kernel_F1 + + .align 5 + +amax_kernel_F4: + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + ble amax_kernel_F1 + + +#if defined(COMPLEX) || defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F4 + +amax_kernel_F1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F10 + + b amax_kernel_L999 + +amax_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + INIT_S + + subs N, N , #1 + ble amax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble amax_kernel_S1 + + .align 5 + +amax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S4 + +amax_kernel_S1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S10 + + +amax_kernel_L999: +#if !defined(__ARM_PCS_VFP) + vmov r0, s0 +#endif + bx lr + + EPILOGUE + From c5495d20563d9a7a142c6726d24c0fd485fcedf6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 May 2019 11:25:43 +0200 Subject: [PATCH 607/935] Ensure correct output for DAMAX with softfp --- kernel/arm/amax_vfp.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S index c780ce5bd..d3770ea1e 100644 --- a/kernel/arm/amax_vfp.S +++ b/kernel/arm/amax_vfp.S @@ -432,8 +432,12 @@ amax_kernel_S10: amax_kernel_L999: -#if !defined(__ARM_PCS_VFP) +#if 
!defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else vmov r0, s0 +#endif #endif bx lr From 74c10b57c6ea9d80f77c469b50f90989843b0bb9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 May 2019 11:38:11 +0200 Subject: [PATCH 608/935] Use generic kernels for complex (I)AMAX to support softfp --- kernel/arm/KERNEL.ARMV6 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1c561deb6..344a71885 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -2,13 +2,13 @@ include $(KERNELDIR)/KERNEL.ARMV5 SAMAXKERNEL = amax_vfp.S DAMAXKERNEL = amax_vfp.S -CAMAXKERNEL = amax_vfp.S -ZAMAXKERNEL = amax_vfp.S +#CAMAXKERNEL = amax_vfp.S +#ZAMAXKERNEL = amax_vfp.S SAMINKERNEL = amax_vfp.S DAMINKERNEL = amax_vfp.S -CAMINKERNEL = amax_vfp.S -ZAMINKERNEL = amax_vfp.S +#CAMINKERNEL = amax_vfp.S +#ZAMINKERNEL = amax_vfp.S SMAXKERNEL = amax_vfp.S DMAXKERNEL = amax_vfp.S @@ -18,13 +18,13 @@ DMINKERNEL = amax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S +#ICAMAXKERNEL = iamax_vfp.S +#IZAMAXKERNEL = iamax_vfp.S ISAMINKERNEL = iamax_vfp.S IDAMINKERNEL = iamax_vfp.S -ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S +#ICAMINKERNEL = iamax_vfp.S +#IZAMINKERNEL = iamax_vfp.S ISMAXKERNEL = iamax_vfp.S IDMAXKERNEL = iamax_vfp.S From 8fe794f059a29922f1a4de7ecd143f35c79eb7e9 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Thu, 23 May 2019 04:23:43 +0000 Subject: [PATCH 609/935] improved zgemm power9 based on power8 --- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/sgemm_kernel_power9.S | 2 +- kernel/power/sgemm_logic_power9.S | 40 +- kernel/power/zgemm_kernel_power9.S | 257 +++++ kernel/power/zgemm_logic_power9.S | 857 ++++++++++++++ kernel/power/zgemm_macros_power9.S | 1664 ++++++++++++++++++++++++++++ param.h | 4 +- 7 files changed, 2802 insertions(+), 24 deletions(-) create mode 100644 kernel/power/zgemm_kernel_power9.S create mode 100644 kernel/power/zgemm_logic_power9.S create mode 100644 kernel/power/zgemm_macros_power9.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 0e0d62393..5c10ad64a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy.o CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o -ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index a44659468..f408cdc17 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 + xscvdpspn alpha_r,vs1 xxspltw alpha_r,alpha_r,0 diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 300e30470..c149cb903 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -53,9 +53,9 @@ LSGEMM_L8x16_BEGIN: LSGEMM_L8x16_LOOP_START: LOAD8x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=32 - addi AO,AO,2112 - addi BO,BO,32 + /*##OffsetA=64 OffsetB=32 + #addi AO,AO,2112 + #addi BO,BO,32 */ mtctr L @@ -63,29 +63,29 @@ LSGEMM_L8x16_LOOP_START: LSGEMM_L8x16_LOOP: - KERNEL8x16_I1_L4_2 -2048,0, 0,0 - KERNEL8x16_I1_L4_2 -2048,0, 1,0 - KERNEL8x16_I1_L4_2 -2048,0, 2,0 - KERNEL8x16_I1_L4_2 -2048,0, 3,0 - KERNEL8x16_I1_L4_2 -2048,0, 4,0 - KERNEL8x16_I1_L4_2 -2048,0, 5,0 - KERNEL8x16_I1_L4_2 -2048,0, 6,0 - KERNEL8x16_I1_L4_2 -2048,0, 7,0 - KERNEL8x16_I1_L4_2 -2048,0, 8,0 - KERNEL8x16_I1_L4_2 -2048,0, 9,0 - KERNEL8x16_I1_L4_2 -2048,0, 10,0 - KERNEL8x16_I1_L4_2 -2048,0, 11,0 - KERNEL8x16_I1_L4_2 -2048,0, 12,0 - KERNEL8x16_I1_L4_2 -2048,0, 13,0 - KERNEL8x16_I1_L4_2 -2048,0, 14,0 - KERNEL8x16_I1_L4_2 -2048,0, 15,1 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_2 64,32, 15,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: - END8x16 0, AO, BO, -2048, 0 + END8x16 0, AO, BO, 64, 32 b LSGEMM_L8x16_SUB1 MY_ALIGN diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S new file mode 100644 index 000000000..e655f0bfe --- /dev/null +++ b/kernel/power/zgemm_kernel_power9.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 32192 + +#define FZERO 312+192(SP) + + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 + +#define L r15 +#define ALPHA r16 +#define T5 r17 +#define T2 r19 +#define BBO r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + stw r0, FZERO + +#ifdef linux + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li o8 , 8 + li o16 , 16 + + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + + + addi ALPHA, SP, 296+192 + + xxlor alpha_r,vs1,vs1 /*copy from register f1 */ + xxlor alpha_i,vs2,vs2 /*copy from register f2 */ + + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + 
ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S new file mode 100644 index 000000000..77ce36294 --- /dev/null +++ b/kernel/power/zgemm_logic_power9.S @@ -0,0 +1,857 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define MY_ALIGN .align 3 + + srawi. J, N, 1 + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + srawi. T1, K, 2 + ble ZGEMM_L2_COPYB1 + +ZGEMM_L2_COPYB8: + + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB8 + +ZGEMM_L2_COPYB1: + + andi. T1, K, 3 + ble ZGEMM_L2_COPYB_END + +ZGEMM_L2_COPYB_LOOP: + + ZCOPYB_2 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB_LOOP + +ZGEMM_L2_COPYB_END: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble ZGEMM_L2x8_END + +ZGEMM_L2x8_BEGIN: + + + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 32x */ + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + + +ZGEMM_L2x8_LOOP_START: + + LOAD2x8 0 + li T2, 1024 + li T3, 1024+512 + li T4, 2048 + li T5, 2048+512 + mtctr L + + MY_ALIGN +ZGEMM_L2x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,64,0,0 + KERNEL2x8_L 128,64,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,64,2,0 + KERNEL2x8_L 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,64,4,0 + KERNEL2x8_L 128,64,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,64,6,0 + KERNEL2x8_L 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,64,8,0 + KERNEL2x8_L 128,64,9,0 + KERNEL2x8_L 128,64,10,0 + KERNEL2x8_L 128,64,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,64,12,0 + KERNEL2x8_L 128,64,13,0 + KERNEL2x8_L 128,64,14,0 + KERNEL2x8_L 128,64,15,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: + END2x8 AO, BO, 128, 64 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB0: + + andi. L, K, 63 + + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x8_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x8_SUB2_LOOP: + LOAD2x8 0 + KERNEL2x8_L 128,64, 0,0 + KERNEL2x8_L 128,64, 1,0 + KERNEL2x8_L 128,64, 2,0 + KERNEL2x8_E 128,64, 3,1 + bdnz ZGEMM_L2x8_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x8_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8 0 + KERNEL2x8_L 128,64, 0,0 + KERNEL2x8_E 128,64, 1,1 + MY_ALIGN +ZGEMM_L2x8_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8 0 + KERNEL2x8_E 128,64, 0,1 + MY_ALIGN +ZGEMM_L2x8_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + +/* addic. L, L, -1 + bgt ZGEMM_L2x8_SUB2_1*/ + +ZGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_END: + +ZGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 4 /**(K-1) % 16x */ + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + +ZGEMM_L2x4_LOOP_START: + LOAD2x4 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x4_LOOP: + KERNEL2x4_L 64,64,0,0 + KERNEL2x4_L 64,64,1,0 + KERNEL2x4_L 64,64,2,0 + KERNEL2x4_L 64,64,3,0 + KERNEL2x4_L 64,64,4,0 + KERNEL2x4_L 64,64,5,0 + KERNEL2x4_L 64,64,6,0 + KERNEL2x4_L 64,64,7,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: + END2x4 AO, BO, 64, 64 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x4_SAVE + +ZGEMM_L2x4_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x4_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x4_SUB2_LOOP: + LOAD2x4 0 + KERNEL2x4_L 64,64, 0,0 + KERNEL2x4_L 64,64, 1,0 + KERNEL2x4_L 64,64, 2,0 + KERNEL2x4_E 64,64, 3,1 + bdnz ZGEMM_L2x4_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x4_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4 0 + KERNEL2x4_L 64,64, 0,0 + KERNEL2x4_E 64,64, 1,1 + MY_ALIGN +ZGEMM_L2x4_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4 0 + KERNEL2x4_E 64,64, 0,1 + MY_ALIGN +ZGEMM_L2x4_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + +ZGEMM_L2x4_SAVE: + + SAVE2x4 + +ZGEMM_L2x4_END: + +ZGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L2x2_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 4 /**(K-1) % 16x */ + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + +ZGEMM_L2x2_LOOP_START: + LOAD2x2 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x2_LOOP: + KERNEL2x2_L 32,64,0,0 + KERNEL2x2_L 32,64,1,0 + KERNEL2x2_L 32,64,2,0 + KERNEL2x2_L 32,64,3,0 + KERNEL2x2_L 32,64,4,0 + KERNEL2x2_L 32,64,5,0 + KERNEL2x2_L 32,64,6,0 + KERNEL2x2_L 32,64,7,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN +ZGEMM_L2x2_LOOP_END: + END2x2 AO, BO, 32, 64 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x2_SAVE + +ZGEMM_L2x2_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x2_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x2_SUB2_LOOP: + LOAD2x2 0 + KERNEL2x2_L 32,64, 0,0 + KERNEL2x2_L 32,64, 1,0 + KERNEL2x2_L 32,64, 2,0 + KERNEL2x2_E 32,64, 3,1 + bdnz ZGEMM_L2x2_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x2_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2 0 + KERNEL2x2_L 32,64, 0,0 + KERNEL2x2_E 32,64, 1,1 + MY_ALIGN +ZGEMM_L2x2_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2 0 + KERNEL2x2_E 32,64, 0,1 + MY_ALIGN +ZGEMM_L2x2_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 +ZGEMM_L2x2_SAVE: + + SAVE2x2 + +ZGEMM_L2x2_END: + +ZGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L2x1_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 4 /**(K-1) % 16x */ + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + +ZGEMM_L2x1_LOOP_START: + + LOAD2x1 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x1_LOOP: + KERNEL2x1_L 16,64,0,0 + KERNEL2x1_L 16,64,1,0 + KERNEL2x1_L 16,64,2,0 + KERNEL2x1_L 16,64,3,0 + KERNEL2x1_L 16,64,4,0 + KERNEL2x1_L 16,64,5,0 + KERNEL2x1_L 16,64,6,0 + KERNEL2x1_L 16,64,7,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: + END2x1 AO, BO, 16, 64 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x1_SAVE + +ZGEMM_L2x1_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x1_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x1_SUB2_LOOP: + LOAD2x1 0 + KERNEL2x1_L 16,64, 0,0 + KERNEL2x1_L 16,64, 1,0 + KERNEL2x1_L 16,64, 2,0 + KERNEL2x1_E 16,64, 3,1 + bdnz ZGEMM_L2x1_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x1_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1 0 + KERNEL2x1_L 16,64, 0,0 + KERNEL2x1_E 16,64, 1,1 + MY_ALIGN +ZGEMM_L2x1_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1 0 + KERNEL2x1_E 16,64, 0,1 + MY_ALIGN +ZGEMM_L2x1_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + +ZGEMM_L2x1_SAVE: + + SAVE2x1 + +ZGEMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt ZGEMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZGEMM_L2_END: + + b ZGEMM_L1_BEGIN + +L999_H1: + + b L999 + +ZGEMM_L1_BEGIN: + andi. T1, N, 1 + ble ZGEMM_L1_END + + mr BO, B + mr BBO, BBUFFER + srawi. T1, K, 3 /*this time K/8 */ + ble ZGEMM_L1_COPYB1 + +ZGEMM_L1_COPYB8: + + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8 + addic. T1, T1, -1 + + bgt ZGEMM_L1_COPYB8 + +ZGEMM_L1_COPYB1: + + andi. T1, K, 7 + ble ZGEMM_L1_COPYB_END + +ZGEMM_L1_COPYB_LOOP: + + ZCOPYB_1 + addic. T1, T1, -1 + + bgt ZGEMM_L1_COPYB_LOOP + +ZGEMM_L1_COPYB_END: + + mr CO, C + mr AO, A + srawi. I, M, 3 + ble ZGEMM_L1x8_END + +ZGEMM_L1x8_BEGIN: + + + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 32x */ + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + + +ZGEMM_L1x8_LOOP_START: + + LOAD1x8 0 + li T2, 1024 + li T3, 1024+512 + li T4, 2048 + li T5, 2048+512 + mtctr L + + MY_ALIGN +ZGEMM_L1x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L 128,32,0,0 + KERNEL1x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL1x8_L 128,32,2,0 + KERNEL1x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L 128,32,4,0 + KERNEL1x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL1x8_L 128,32,6,0 + KERNEL1x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L 128,32,8,0 + KERNEL1x8_L 128,32,9,0 + KERNEL1x8_L 128,32,10,0 + KERNEL1x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL1x8_L 128,32,12,0 + KERNEL1x8_L 128,32,13,0 + KERNEL1x8_L 128,32,14,0 + KERNEL1x8_L 128,32,15,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: + END1x8 AO, BO, 128, 32 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x8_SAVE + +ZGEMM_L1x8_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x8_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x8_SUB2_LOOP: + LOAD1x8 0 + KERNEL1x8_L 128,32, 0,0 + KERNEL1x8_L 128,32, 1,0 + KERNEL1x8_L 128,32, 2,0 + KERNEL1x8_E 128,32, 3,1 + bdnz ZGEMM_L1x8_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x8_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8 0 + KERNEL1x8_L 128,32, 0,0 + KERNEL1x8_E 128,32, 1,1 + MY_ALIGN +ZGEMM_L1x8_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8 0 + KERNEL1x8_E 128,32, 0,1 + MY_ALIGN +ZGEMM_L1x8_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + +/* addic. L, L, -1 + bgt ZGEMM_L1x8_SUB2_1*/ + +ZGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt ZGEMM_L1x8_BEGIN + +ZGEMM_L1x8_END: + +ZGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L1x1_END + + andi. T1, M, 4 + ble ZGEMM_L1x4_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 5 /**(K-1) % 16x */ + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + +ZGEMM_L1x4_LOOP_START: + LOAD1x4 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x4_LOOP: + KERNEL1x4_L 64,32,0,0 + KERNEL1x4_L 64,32,1,0 + KERNEL1x4_L 64,32,2,0 + KERNEL1x4_L 64,32,3,0 + KERNEL1x4_L 64,32,4,0 + KERNEL1x4_L 64,32,5,0 + KERNEL1x4_L 64,32,6,0 + KERNEL1x4_L 64,32,7,0 + KERNEL1x4_L 64,32,8,0 + KERNEL1x4_L 64,32,9,0 + KERNEL1x4_L 64,32,10,0 + KERNEL1x4_L 64,32,11,0 + KERNEL1x4_L 64,32,12,0 + KERNEL1x4_L 64,32,13,0 + KERNEL1x4_L 64,32,14,0 + KERNEL1x4_L 64,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN +ZGEMM_L1x4_LOOP_END: + END1x4 AO, BO, 64, 32 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + +ZGEMM_L1x4_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x4_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x4_SUB2_LOOP: + LOAD1x4 0 + KERNEL1x4_L 64,32, 0,0 + KERNEL1x4_L 64,32, 1,0 + KERNEL1x4_L 64,32, 2,0 + KERNEL1x4_E 64,32, 3,1 + bdnz ZGEMM_L1x4_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x4_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4 0 + KERNEL1x4_L 64,32, 0,0 + KERNEL1x4_E 64,32, 1,1 + MY_ALIGN +ZGEMM_L1x4_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4 0 + KERNEL1x4_E 64,32, 0,1 + MY_ALIGN +ZGEMM_L1x4_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + +ZGEMM_L1x4_SAVE: + + SAVE1x4 + +ZGEMM_L1x4_END: + +ZGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L1x2_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 16x */ + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + +ZGEMM_L1x2_LOOP_START: + LOAD1x2 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x2_LOOP: + KERNEL1x2_L 32,32,0,0 + KERNEL1x2_L 32,32,1,0 + KERNEL1x2_L 32,32,2,0 + KERNEL1x2_L 32,32,3,0 + KERNEL1x2_L 32,32,4,0 + KERNEL1x2_L 32,32,5,0 + KERNEL1x2_L 32,32,6,0 + KERNEL1x2_L 32,32,7,0 + KERNEL1x2_L 32,32,8,0 + KERNEL1x2_L 32,32,9,0 + KERNEL1x2_L 32,32,10,0 + KERNEL1x2_L 32,32,11,0 + KERNEL1x2_L 32,32,12,0 + KERNEL1x2_L 32,32,13,0 + KERNEL1x2_L 32,32,14,0 + KERNEL1x2_L 32,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN +ZGEMM_L1x2_LOOP_END: + END1x2 AO, BO, 32, 32 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + +ZGEMM_L1x2_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x2_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x2_SUB2_LOOP: + LOAD1x2 0 + KERNEL1x2_L 32,32, 0,0 + KERNEL1x2_L 32,32, 1,0 + KERNEL1x2_L 32,32, 2,0 + KERNEL1x2_E 32,32, 3,1 + bdnz ZGEMM_L1x2_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x2_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2 0 + KERNEL1x2_L 32,32, 0,0 + KERNEL1x2_E 32,32, 1,1 + MY_ALIGN +ZGEMM_L1x2_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2 0 + KERNEL1x2_E 32,32, 0,1 + MY_ALIGN +ZGEMM_L1x2_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 +ZGEMM_L1x2_SAVE: + + SAVE1x2 + +ZGEMM_L1x2_END: + +ZGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L1x1_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 5 /**(K-1) % 16x */ + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + +ZGEMM_L1x1_LOOP_START: + + LOAD1x1 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x1_LOOP: + KERNEL1x1_L 16,32,0,0 + KERNEL1x1_L 16,32,1,0 + KERNEL1x1_L 16,32,2,0 + KERNEL1x1_L 16,32,3,0 + KERNEL1x1_L 16,32,4,0 + KERNEL1x1_L 16,32,5,0 + KERNEL1x1_L 16,32,6,0 + KERNEL1x1_L 16,32,7,0 + KERNEL1x1_L 16,32,8,0 + KERNEL1x1_L 16,32,9,0 + KERNEL1x1_L 16,32,10,0 + KERNEL1x1_L 16,32,11,0 + KERNEL1x1_L 16,32,12,0 + KERNEL1x1_L 16,32,13,0 + KERNEL1x1_L 16,32,14,0 + KERNEL1x1_L 16,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN +ZGEMM_L1x1_LOOP_END: + END1x1 AO, BO, 16, 32 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + +ZGEMM_L1x1_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x1_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x1_SUB2_LOOP: + LOAD1x1 0 + KERNEL1x1_L 16,32, 0,0 + KERNEL1x1_L 16,32, 1,0 + KERNEL1x1_L 16,32, 2,0 + KERNEL1x1_E 16,32, 3,1 + bdnz ZGEMM_L1x1_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x1_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1 0 + KERNEL1x1_L 16,32, 0,0 + KERNEL1x1_E 16,32, 1,1 + MY_ALIGN +ZGEMM_L1x1_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1 0 + KERNEL1x1_E 16,32, 0,1 + MY_ALIGN +ZGEMM_L1x1_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + +ZGEMM_L1x1_SAVE: + + SAVE1x1 + +ZGEMM_L1x1_END: + +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S new file mode 100644 index 000000000..93a309ad1 --- /dev/null +++ b/kernel/power/zgemm_macros_power9.S @@ -0,0 +1,1664 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V + AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7 +.endm + +.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8 + xxlxor \TEMP1, \TEMP1, \TEMP1 + xxlxor \TEMP2, \TEMP2, \TEMP2 + + xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB + XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB + + xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB + xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB + + XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB + XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB + + xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i + xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r + xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r + xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i + + xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i + xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r + xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) 
+#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD2x8 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A + +.if \Zero==1 + Zero2x8 +.endif + +.endm + +.macro END2x8_NORMAL + END2x8 AO,BO,128,64 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, 
\OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B 
+ lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x8 + LOAD2x8 0 + END2x8 AO, BO, 128,64 +.endm + +.macro SAVE2x8 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs48,vs49,vs8 + AGGREGATE_INTO_COMPLEX vs50,vs51,vs9 + AGGREGATE_INTO_COMPLEX vs52,vs53,vs10 + AGGREGATE_INTO_COMPLEX vs54,vs55,vs11 + AGGREGATE_INTO_COMPLEX vs56,vs57,vs12 + 
AGGREGATE_INTO_COMPLEX vs58,vs59,vs13 + AGGREGATE_INTO_COMPLEX vs60,vs61,vs14 + AGGREGATE_INTO_COMPLEX vs62,vs63,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + addi CO, CO, 128 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD2x4 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + +.if \Zero==1 + Zero2x4 +.endif + +.endm + +.macro END2x4_NORMAL + END2x4 AO,BO,64,64 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag 
part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x4 + LOAD2x4 0 + END2x4 AO, BO, 64,64 +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs40,vs41,vs8 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs9 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs10 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + 
stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + addi CO, CO, 64 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + +.macro LOAD2x2 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + +.if \Zero==1 + Zero2x2 +.endif + +.endm + +.macro END2x2_NORMAL + END2x2 AO,BO,32,64 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, 
vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x2 + LOAD2x2 0 + END2x2 AO, BO, 32,64 +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs36,vs37,vs8 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + + addi CO, CO, 32 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 +.endm + +.macro LOAD2x1 Zero + lxv vs0, 0(AO) // load real,imag from A + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + +.if \Zero==1 + Zero2x1 +.endif + +.endm + +.macro END2x1_NORMAL + END2x1 AO,BO,16,64 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, 
imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x1 + LOAD2x1 0 + END2x1 AO, BO, 16,64 +.endm + +.macro SAVE2x1 + + mr T1, CO +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + + AGGREGATE_INTO_COMPLEX vs34,vs35,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + + addi CO, CO, 16 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD1x8 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A + +.if \Zero==1 + Zero1x8 +.endif + +.endm + +.macro END1x8_NORMAL + END1x8 AO,BO,128,32 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, 
DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + +.endm + +.macro KERNEL1x8 + LOAD1x8 0 + END1x8 AO, BO, 128,32 +.endm + +.macro SAVE1x8 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + AGGREGATE_INTO_COMPLEX 
vs40,vs41,vs12 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + addi CO, CO, 128 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + +.macro LOAD1x4 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + +.if \Zero==1 + Zero1x4 +.endif + +.endm + +.macro END1x4_NORMAL + END1x4 AO,BO,64,32 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, 
imag*imag + +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL1x4 + LOAD1x4 0 + END1x4 AO, BO, 64,32 +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + addi CO, CO, 64 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 +.endm + +.macro LOAD1x2 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + +.if \Zero==1 + Zero1x2 +.endif + +.endm + +.macro END1x2_NORMAL + END1x2 AO,BO,32,32 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // 
load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + +.endm + +.macro KERNEL1x2 + LOAD1x2 0 + END1x2 AO, BO, 32,32 +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + +addi CO, CO, 32 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + +.macro LOAD1x1 Zero + lxv vs0, 0(AO) // load real,imag from A + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + +.if \Zero==1 + Zero1x1 +.endif + +.endm + +.macro END1x1_NORMAL + END1x1 AO,BO,16,32 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, 
vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1 + LOAD1x1 0 + END1x1 AO, BO, 16,32 + +.endm + +.macro SAVE1x1 + + mr T1, CO +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + +addi CO, CO, 16 + +.endm + + +.macro ZCOPYB_2 + + lxv vs32, 0(BO) + lxv vs33, 16(BO) + addi BO, BO, 32 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + xxspltd vs42, vs33, 1 + xxspltd vs43, vs33, 0 + + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + stxv vs42, 32(BBO) + stxv vs43, 48(BBO) + addi BBO, BBO, 64 + +.endm + +.macro ZCOPYB_1 + + lxv vs32, 0(BO) + addi BO, BO, 16 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + + addi BBO, BBO, 32 + +.endm + +.macro ZCOPYB_8 + + lxv vs32, 0(BO) + lxv vs33, 16(BO) + lxv vs34, 32(BO) + lxv vs35, 48(BO) + + lxv vs36, 64+0(BO) + lxv vs37, 64+16(BO) + lxv vs38, 64+32(BO) + lxv vs39, 64+48(BO) + addi BO, BO, 128 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + xxspltd vs42, vs33, 1 + xxspltd vs43, vs33, 0 + xxspltd vs44, vs34, 1 + xxspltd vs45, vs34, 0 + xxspltd vs46, vs35, 1 + xxspltd vs47, vs35, 0 + + xxspltd vs48, vs36, 1 + xxspltd vs49, vs36, 0 + xxspltd vs50, vs37, 1 + xxspltd vs51, vs37, 0 + xxspltd vs52, vs38, 1 + xxspltd vs53, vs38, 0 + xxspltd vs54, vs39, 1 + xxspltd vs55, vs39, 0 + + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + stxv vs42, 32(BBO) + stxv vs43, 48(BBO) + + stxv vs44, 64+0(BBO) + stxv vs45, 64+16(BBO) + stxv vs46, 64+32(BBO) + stxv vs47, 64+48(BBO) + + stxv vs48, 128+ 0(BBO) + stxv vs49, 128+ 16(BBO) + stxv vs50, 128+ 32(BBO) + stxv vs51, 128+ 48(BBO) + + stxv vs52, 192 + 0(BBO) + stxv vs53, 192 + 16(BBO) + stxv vs54, 192+ 32(BBO) + stxv vs55, 192 + 48(BBO) + addi BBO, BBO, 256 + +.endm + diff --git a/param.h b/param.h index 4dcd96a75..d0b8518c9 100644 --- a/param.h +++ b/param.h @@ -2251,12 +2251,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
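The ZCOPYB_* macros above pre-expand the B panel: each complex double (16 bytes) is split into two splatted vectors, one per component, so the kernels can multiply a whole vector of A by one scalar part of B at a time. A hedged scalar model of one ZCOPYB_1 step (the lane order picked by xxspltd is endianness-dependent, so the exact component ordering here is an assumption):

    #include <cstdio>

    // one complex double from BO becomes two duplicated pairs in BBO
    static void zcopyb_1(const double bo[2], double bbo[4]) {
        bbo[0] = bbo[1] = bo[1];  // xxspltd vs40, vs32, 1 -> one component twice
        bbo[2] = bbo[3] = bo[0];  // xxspltd vs41, vs32, 0 -> the other component twice
    }

    int main() {
        double b[2] = {5.0, 7.0}, out[4];
        zcopyb_1(b, out);
        std::printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);
        return 0;
    }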
 #define SGEMM_DEFAULT_P 640
 #define DGEMM_DEFAULT_P 128
 #define CGEMM_DEFAULT_P 640
-#define ZGEMM_DEFAULT_P 320
+#define ZGEMM_DEFAULT_P 512

 #define SGEMM_DEFAULT_Q 1408
 #define DGEMM_DEFAULT_Q 384
 #define CGEMM_DEFAULT_Q 640
-#define ZGEMM_DEFAULT_Q 640
+#define ZGEMM_DEFAULT_Q 1152

 #define SYMV_P 8

From c00289ba543121a78c5ab07a8e45385cc12fb9a8 Mon Sep 17 00:00:00 2001
From: TiborGY
Date: Sat, 1 Jun 2019 21:30:06 +0200
Subject: [PATCH 610/935] upload thread safety test folder

---
 cpp_thread_test/Makefile                   |  14 +++
 cpp_thread_test/cpp_thread_safety_common.h |  55 +++++++++++
 cpp_thread_test/dgemm_thread_safety.cpp    |  92 +++++++++++++++
 cpp_thread_test/dgemv_thread_safety.cpp    | 101 +++++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 cpp_thread_test/Makefile
 create mode 100644 cpp_thread_test/cpp_thread_safety_common.h
 create mode 100644 cpp_thread_test/dgemm_thread_safety.cpp
 create mode 100644 cpp_thread_test/dgemv_thread_safety.cpp

diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile
new file mode 100644
index 000000000..81e3470ef
--- /dev/null
+++ b/cpp_thread_test/Makefile
@@ -0,0 +1,14 @@
+include ../Makefile.rule
+
+all :: dgemv_tester dgemm_tester
+
+dgemv_tester :
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
+	./dgemv_tester
+
+dgemm_tester : dgemv_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
+	./dgemm_tester
+
+clean ::
+	rm -f dgemv_tester dgemm_tester

diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h
new file mode 100644
index 000000000..60ab5bb2f
--- /dev/null
+++ b/cpp_thread_test/cpp_thread_safety_common.h
@@ -0,0 +1,55 @@
+inline void pauser(){
+    /// a portable way to pause a program
+    std::string dummy;
+    std::cout << "Press enter to continue...";
+    std::getline(std::cin, dummy);
+}
+
+void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
+	for(uint32_t i=0; i<numMat; i++){
+		for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
+			matBlock[i][j] = rngdist(PRNG);
+		}
+	}
+	for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
+		for(uint32_t j=0; j<numMat; j++){
+			matBlock[i+j] = matBlock[j];
+		}
+	}
+}
+
+void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
+	for(uint32_t i=0; i<numVec; i++){
+		for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+			vecBlock[i][j] = rngdist(PRNG);
+		}
+	}
+	for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
+		for(uint32_t j=0; j<numVec; j++){
+			vecBlock[i+j] = vecBlock[j];
+		}
+	}
+}
+
+std::mt19937_64 InitPRNG(){
+	std::random_device rd;
+	std::mt19937_64 PRNG(rd());
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
+	//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
+	for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
+	return PRNG;
+}
+
+void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
+	for (uint32_t i=0;i<(numConcurrentThreads*numMat);i++){
+		std::cout<<i<<std::endl;
+		for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+			for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
+				std::cout<<matBlock[i][j*randomMatSize + k]<<"  ";
+			}
+			std::cout<<std::endl;
+		}
+		std::cout<<std::endl;
+	}
+}

diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp
new file mode 100644
--- /dev/null
+++ b/cpp_thread_test/dgemm_thread_safety.cpp
@@ -0,0 +1,92 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <random>
+#include <future>
+#include <omp.h>
+#include "../cblas.h"
+#include "cpp_thread_safety_common.h"
+
+void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
+	cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
+}
+
+int main(int argc, char* argv[]){
+	blasint randomMatSize = 1024; //dimension of the random square matrices used
+	uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
+	uint32_t numTestRounds = 16; //number of testing rounds before success exit
+
+	if (argc > 4){
+		std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
+		abort();
+	}
+	if (argc == 4){
+		std::vector<std::string> cliArgs;
+		for (int i = 1; i < argc; i++){
+			cliArgs.push_back(argv[i]);
+			std::cout<<cliArgs.at(i-1)<<std::endl;
+		}
+		randomMatSize = std::stoul(cliArgs.at(0));
+		numConcurrentThreads = std::stoul(cliArgs.at(1));
+		numTestRounds = std::stoul(cliArgs.at(2));
+	}
+
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
+	std::vector<std::future<void>> futureBlock(numConcurrentThreads);
+
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"| DGEMM thread safety tester |\n";
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<std::endl;
+	std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl;
+	std::cout<<"Number of testing rounds : "<<numTestRounds<<std::endl;
+	std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+
+	std::mt19937_64 PRNG = InitPRNG();
+
+	for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
+		matBlock[i].resize(randomMatSize*randomMatSize);
+	}
+	FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
+
+	omp_set_num_threads(numConcurrentThreads);
+	for(uint32_t R=0; R<numTestRounds; R++){
+		#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
+		for(uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
+		}
+		for(uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i].get();
+		}
+		for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){
+			for(uint32_t j=0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
+				if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
+					std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
+					return -1;
+				}
+			}
+		}
+	}
+	std::cout<<"CBLAS DGEMM thread safety test PASSED!"<<std::endl;
+	return 0;
+}

diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp
new file mode 100644
--- /dev/null
+++ b/cpp_thread_test/dgemv_thread_safety.cpp
@@ -0,0 +1,101 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <random>
+#include <future>
+#include <omp.h>
+#include "../cblas.h"
+#include "cpp_thread_safety_common.h"
+
+void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
+	const blasint inc = 1;
+	cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
+}
+
+int main(int argc, char* argv[]){
+	blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
+	uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
+	uint32_t numTestRounds = 16; //number of testing rounds before success exit
+
+	if (argc > 4){
+		std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
+		abort();
+	}
+	if (argc == 4){
+		std::vector<std::string> cliArgs;
+		for (int i = 1; i < argc; i++){
+			cliArgs.push_back(argv[i]);
+			std::cout<<cliArgs.at(i-1)<<std::endl;
+		}
+		randomMatSize = std::stoul(cliArgs.at(0));
+		numConcurrentThreads = std::stoul(cliArgs.at(1));
+		numTestRounds = std::stoul(cliArgs.at(2));
+	}
+
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	std::vector<std::vector<double>> matBlock(numConcurrentThreads);
+	std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
+	std::vector<std::future<void>> futureBlock(numConcurrentThreads);
+
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"| DGEMV thread safety tester |\n";
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<std::endl;
+	std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl;
+	std::cout<<"Number of testing rounds : "<<numTestRounds<<std::endl;
+	std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+
+	std::mt19937_64 PRNG = InitPRNG();
+
+	for(uint32_t i=0; i<numConcurrentThreads; i++){
+		matBlock[i].resize(randomMatSize*randomMatSize);
+	}
+	for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
+		vecBlock[i].resize(randomMatSize);
+	}
+	FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
+	FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
+
+	omp_set_num_threads(numConcurrentThreads);
+	for(uint32_t R=0; R<numTestRounds; R++){
+		#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
+		for(uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
+		}
+		for(uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i].get();
+		}
+		for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){
+			for(uint32_t j=0; j < static_cast<uint32_t>(randomMatSize); j++){
+				if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
+					std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
+					return -1;
+				}
+			}
+		}
+	}
+	std::cout<<"CBLAS DGEMV thread safety test PASSED!"<<std::endl;
+	return 0;
+}

From: TiborGY
Date: Sat, 1 Jun 2019 21:32:52 +0200
Subject: [PATCH 611/935] hook up c++ thread safety test (main Makefile)

---
 Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 21096f893..20ef1e868 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ endif

 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))

-SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
+SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

 .PHONY : all libs netlib $(RELA) test ctest shared install
 .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@@ -127,6 +127,9 @@ ifndef NO_FBLAS
 endif
 ifndef NO_CBLAS
 	$(MAKE) -C ctest all
+ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
+	$(MAKE) -C cpp_thread_test all
+endif
 endif
 endif

From 16f3df5d3551ff705d5d23dcdf26853114fb6956 Mon Sep 17 00:00:00 2001
From: TiborGY
Date: Sat, 1 Jun 2019 21:36:41 +0200
Subject: [PATCH 612/935] add c++ thread test option to Makefile.rule

---
 Makefile.rule | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Makefile.rule b/Makefile.rule
index 7c128fb49..209934991 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -220,6 +220,21 @@ COMMON_PROF = -pg
 # SYMBOLPREFIX=
 # SYMBOLSUFFIX=

+# Run a C++ based thread safety tester after the build is done.
+# This is mostly intended as a developer feature to spot regressions, but users and
+# package maintainers can enable this if they have doubts about the thread safety of
+# the library, given the configuration in this file.
+# By default, the thread safety tester launches 52 concurrent calculations at the same
+# time.
+
+# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
+
+# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
+# an OpenMP implementation. If you are cross-compiling this test will probably not
+# work at all.
+
+# CPP_THREAD_SAFETY_TEST = 1
+
 #
 # End of user configuration
 #

From 27649b95430cbed40923db4ab45119af6b05acb3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 3 Jun 2019 11:01:33 +0200
Subject: [PATCH 613/935] Document NO_AVX512 for #2151

---
 Makefile.rule | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.rule b/Makefile.rule
index 255d1da46..65d04ee3e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -163,6 +163,10 @@ NO_AFFINITY = 1
 # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
 # NO_AVX2 = 1

+# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
+# system will try to determine this automatically)
+# NO_AVX512 = 1
+
 # Don't use parallel make.
# NO_PARALLEL_MAKE = 1 From a469b32cf43772bb14253a405be8f088ce3a9d83 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Fri, 31 May 2019 22:48:16 +0000 Subject: [PATCH 614/935] sgemm pipeline improved, zgemm rewritten without inner packs, ABI lxvx v20 fixed with vs52 --- benchmark/gemm.c | 2 +- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/dgemm_kernel_power9.S | 48 +- kernel/power/sgemm_kernel_power9.S | 140 +- kernel/power/sgemm_logic_power9.S | 192 ++- kernel/power/sgemm_macros_power9.S | 861 ++++------ kernel/power/zgemm_kernel_power9.S | 116 +- kernel/power/zgemm_logic_power9.S | 786 ++++++---- kernel/power/zgemm_macros_power9.S | 2333 +++++++++++++--------------- param.h | 8 +- 10 files changed, 2067 insertions(+), 2421 deletions(-) diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 85bcbc710..dd016a7c3 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -207,7 +207,7 @@ int main(int argc, char *argv[]){ for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - + fprintf(stderr, " SIZE Flops Time\n"); for (i = from; i <= to; i += step) { diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 5c10ad64a..440eaab1b 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index a1762dcf2..2fb1b27ef 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S @@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) stfd f1, ALPHA_SP @@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r15, 272(SP) ld r14, 280(SP) - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) addi SP, SP, STACKSIZE blr diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index f408cdc17..7a0f3143e 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
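A note on the v20 -> vs52 style renames in the save/restore code above: on POWER, the VSX register file overlays the older register sets, with vs0-vs31 aliasing the FPRs and vs32-vs63 aliasing the VMX registers v0-v31, i.e. vsN names the same storage as v(N-32), so vs52 is v20 and vs63 is v31. The stxv/lxv pairs therefore still save and restore exactly the non-volatile vector registers the ABI requires; if I read the commit message ("ABI lxvx v20 fixed with vs52") correctly, spelling the operand as a VSR index simply removes any ambiguity about which register numbering the lxv/stxv operand field uses.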
#define LOAD ld #define STACKSIZE (512 ) - +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ #define M r3 #define N r4 #define K r5 @@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROFCODE addi SP, SP, -STACKSIZE - li r0, 0 + mflr r0 + stfd f14, 0(SP) stfd f15, 8(SP) @@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) - + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) #if defined(TRMMKERNEL) @@ -157,72 +158,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif slwi LDC, LDC, 2 - -/* cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 -*/ /*alpha is stored in f1. convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 /*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ lis T2, perm_const2@highest - ori T2, T2, perm_const2@higher - rldicr T2, T2, 32, 31 - oris T2, T2, perm_const2@h - ori T2, T2, perm_const2@l - lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l ori T1, T1, perm_const1@l - + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 mtvsrdd permute_mask,T2,T1 - - lis T2, save_permute_12@highest - ori T2, T2, save_permute_12@higher - rldicr T2, T2, 32, 31 - oris T2, T2, save_permute_12@h - ori T2, T2, save_permute_12@l - - lis T1, save_permute_11@highest - ori T1, T1, save_permute_11@higher - rldicr T1, T1, 32, 31 - oris T1, T1, save_permute_11@h - ori T1, T1, save_permute_11@l - - mtvsrdd save_permute_1,T2,T1 - - lis T2, save_permute_22@highest - ori T2, T2, save_permute_22@higher - rldicr T2, T2, 32, 31 - oris T2, T2, save_permute_22@h - ori T2, T2, save_permute_22@l - - lis T1, save_permute_21@highest - ori T1, T1, save_permute_21@higher - rldicr T1, T1, 32, 31 - oris T1, T1, save_permute_21@h - ori T1, T1, save_permute_21@l - - mtvsrdd save_permute_2,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 #include "sgemm_logic_power9.S" -.L999: - addi r3, 0, 0 - +.L999: lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) @@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
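The reshuffled constant setup above is a scheduling change: each 64-bit constant still needs the five-instruction lis/ori/rldicr/oris/ori recipe, but the sequences for all six constants (the permute mask halves and the four save_permute words) are now interleaved so that six independent dependency chains run in parallel instead of one after another. For reference, a C model of what the recipe computes; the example value is perm_const2 as given in the comment in this file:

    #include <cstdint>
    #include <cstdio>

    // five-step PPC64 64-bit immediate build: lis/ori/rldicr/oris/ori
    // (lis sign-extends in hardware, but the rldicr shift discards that anyway)
    static uint64_t build_imm(uint16_t highest, uint16_t higher, uint16_t high, uint16_t low) {
        uint64_t r = (uint64_t)highest << 16;  // lis
        r |= higher;                           // ori
        r <<= 32;                              // rldicr r, r, 32, 31
        r |= (uint64_t)high << 16;             // oris
        r |= low;                              // ori
        return r;
    }

    int main() {
        std::printf("%016llx\n",
            (unsigned long long)build_imm(0x0c0d, 0x0e0f, 0x0809, 0x0a0b)); // 0c0d0e0f08090a0b
        return 0;
    }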
ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) - - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) + ld r0, FLINK_SAVE(SP) - addi SP, SP, STACKSIZE + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE blr + EPILOGUE #endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index c149cb903..25e8c8387 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,5 +1,94 @@ #define MY_ALIGN .align 3 +b L8 + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_0 + mtctr L + MY_ALIGN + +LSGEMM_L8x16_LOOP: + + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_2 64,32, 15,0 + KERNEL8x16_I1_L4_2 64,32, 16,0 + KERNEL8x16_I1_L4_2 64,32, 17,0 + KERNEL8x16_I1_L4_2 64,32, 18,0 + KERNEL8x16_I1_L4_2 64,32, 19,0 + KERNEL8x16_I1_L4_2 64,32, 20,0 + KERNEL8x16_I1_L4_2 64,32, 21,0 + KERNEL8x16_I1_L4_2 64,32, 22,0 + KERNEL8x16_I1_L4_2 64,32, 23,0 + KERNEL8x16_I1_L4_2 64,32, 24,0 + KERNEL8x16_I1_L4_2 64,32, 25,0 + KERNEL8x16_I1_L4_2 64,32, 26,0 + KERNEL8x16_I1_L4_2 64,32, 27,0 + KERNEL8x16_I1_L4_2 64,32, 28,0 + KERNEL8x16_I1_L4_2 64,32, 29,0 + KERNEL8x16_I1_L4_2 64,32, 30,0 + KERNEL8x16_I1_L4_2 64,32, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16 0, AO, BO, 64, 32 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_3 64,32, 15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_3 64,32, 7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_3 64,32, 3,1 + blr + +L8: #if defined(TRMMKERNEL) && !defined(LEFT) neg TEMP_REG, OFFSET #endif @@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN: REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ + srawi. L, T12, 7 /**(T11-1) % 128x */ #else mr T12, K addi T12,T12, -1 - srawi. L, T12, 6 /**(K-1) % 64x */ + srawi. 
L, T12, 7 /**(K-1) % 128x */ #endif ZERO8x16 ble LSGEMM_L8x16_SUB0 - - MY_ALIGN -LSGEMM_L8x16_LOOP_START: - - LOAD8x16_0 /*we already zeroed */ - /*##OffsetA=64 OffsetB=32 - #addi AO,AO,2112 - #addi BO,BO,32 */ - - mtctr L - - MY_ALIGN - -LSGEMM_L8x16_LOOP: - - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_2 64,32, 15,1 - - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - - END8x16 0, AO, BO, 64, 32 - - b LSGEMM_L8x16_SUB1 + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 MY_ALIGN LSGEMM_L8x16_SUB0: #if defined(TRMMKERNEL) - andi. L, T11, 127 + andi. L, T11, 255 + cmpwi T11,128 #else - andi. L, K, 127 + andi. L, K, 255 + cmpwi K,128 #endif - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L8x16_SAVE + + bne LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB2_128: + bl LSGEMM_L8x16_L64_SUB + bl LSGEMM_L8x16_L64_SUB + b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: - - srawi. T10,L, 5 + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 ble LSGEMM_L8x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L8x16_SUB2_LOOP: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_3 64,32, 7,1 - bdnz LSGEMM_L8x16_SUB2_LOOP - MY_ALIGN + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN LSGEMM_L8x16_SUB2_16: andi. T10,L, 16 ble LSGEMM_L8x16_SUB2_8 - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_3 64,32, 3,1 + bl LSGEMM_L8x16_L16_SUB MY_ALIGN LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 @@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 ble LSGEMM_L8x16_SAVE KERNEL8x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L8x16_SUB2 + MY_ALIGN LSGEMM_L8x16_SAVE: diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index c61f419ac..3f86a1d25 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 + KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast @@ -112,15 +112,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
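The restructuring above is easier to follow in pseudo-code: the unrolled main loop now consumes 128 K-iterations per pass (srawi. L, T12, 7 on T12 = K-1), and the remainder is peeled with power-of-two subroutines that are bl-called instead of inlined, which shrinks the code footprint. A rough model of that bookkeeping, under my reading of the masks (the trailing single iteration is folded into the LOAD8x16_0/END8x16 pair):

    #include <cstdio>

    static void run_kernel8x16(int width) { std::printf("kernel x%d\n", width); } // stand-in

    static void k_loop(long K) {
        long T12 = K - 1;
        for (long i = 0, n = T12 >> 7; i < n; i++) run_kernel8x16(128); // srawi. L, T12, 7
        long rem = T12 & 127;                                           // andi. L, T12, 127
        if (rem & 64) run_kernel8x16(64);  // bl LSGEMM_L8x16_L64_SUB
        if (rem & 32) run_kernel8x16(32);  // bl LSGEMM_L8x16_L32_SUB
        if (rem & 16) run_kernel8x16(16);  // bl LSGEMM_L8x16_L16_SUB
        // ...8/4/2/1 tails handled by LSGEMM_L8x16_SUB2_8 .. SUB2_1
    }

    int main() { k_loop(200); return 0; }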
lxv vs24, 0(BO) lxv vs28, 16(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask lxv vs0, 0(AO) lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 - + lxv vs2, 32(AO) + lxv vs3, 48(AO) xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 @@ -259,247 +258,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 - - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - 
xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 +KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete .endm @@ -509,224 +269,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
END8x16 \First, AO, BO, 64,32 .endm -.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) +.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs52, vs0,vs29 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs60, vs0,vs31 xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs37, vs1,vs25 -.else - xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) +.endif + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs58, vs2,vs30 xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) .endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs36, vs4,vs9 .if \Complete==0 lxv vs24, 
DISP16(\Index,32+\OffsetB)(\BREG) lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif +.endif .if \IsLast==1 .if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) .else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) + addi \BREG, \BREG, DISP16(\Index,64) + .endif +.endif + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask .endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs52, vs4,vs13 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 .endif + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs60, vs4,vs15 .if \Complete==0 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - xvmulsp vs50, vs6,vs12 - xvmulsp vs51, vs7,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - xvmulsp vs54, vs6,vs13 - xvmulsp vs55, vs7,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - xvmulsp vs58, vs6,vs14 - xvmulsp vs59, vs7,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - xvmulsp vs62, vs6,vs15 - xvmulsp vs63, vs7,vs15 + +.endif -.else - xvmaddasp vs40, vs4,vs10 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs37, vs5,vs9 xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.endif + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs51, vs7,vs12 + xvmaddasp vs55, vs7,vs13 + xvmaddasp vs59, vs7,vs14 + xvmaddasp vs63, vs7,vs15 + .endm @@ -763,7 +433,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxmrghw vs2, vs37, vs41 xxmrghw vs3, vs33, vs45 - +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif xxmrglw vs16, vs34, vs46 xxmrglw vs18, vs38, vs42 @@ -784,176 +457,203 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxmrghw vs30, vs39, vs43 xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) +#ifndef TRMMKERNEL lxv vs34, 32(CO) lxv vs35, 48(CO) #endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 #ifndef TRMMKERNEL lxv vs36, 0(T1) lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL lxv vs38, 32(T1) lxv vs39, 48(T1) #endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + #ifndef TRMMKERNEL lxv vs40, 0(T2) lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL lxv vs42, 32(T2) lxv vs43, 48(T2) #endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 #ifndef TRMMKERNEL lxv vs44, 0(T3) - lxv vs45, 16(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL lxv vs46, 32(T3) lxv vs47, 48(T3) #endif - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 + + - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 - +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif xxperm vs24, vs30, save_permute_1 xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r +#else xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif - - + stxv vs38, 32(T1) + stxv vs39, 48(T1) #ifdef TRMMKERNEL xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - + xvmulsp vs41, vs14, alpha_r +#else xvmaddasp vs40, vs10, alpha_r xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - 
xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - stxv vs36, 0(T1) - stxv vs37, 16(T1) - stxv vs38, 32(T1) - stxv vs39, 48(T1) +#endif stxv vs40, 0(T2) stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif stxv vs42, 32(T2) stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif stxv vs44, 0(T3) stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif stxv vs46, 32(T3) stxv vs47, 48(T3) /*****the same with the second 8X8 ****/ -#ifndef TRMMKERNEL - + #ifndef TRMMKERNEL lxv vs32, 0(T4) lxv vs33, 16(T4) - lxv vs34, 32(T4) - lxv vs35, 48(T4) - lxv vs36, 0(T5) - lxv vs37, 16(T5) - lxv vs38,32(T5) - lxv vs39, 48(T5) #endif - xxmrglw vs8, vs48, vs60 xxmrglw vs10, vs52, vs56 - +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif xxmrghw vs1, vs48, vs60 xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif xxmrglw vs12, vs49, vs61 xxmrglw vs14, vs53, vs57 - +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 #ifndef TRMMKERNEL lxv vs40, 0(T6) - lxv vs41, 16(T6) - lxv vs42, 32(T6) - lxv vs43, 48(T6) - lxv vs44, 0(T7) - lxv vs45, 16(T7) - lxv vs46, 32(T7) - lxv vs47, 48(T7) + lxv vs41, 16(T6) #endif - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 - xxmrglw vs16, vs50, vs62 xxmrglw vs18, vs54, vs58 - +#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxmrghw vs4, vs54, vs58 xxmrghw vs5, vs50, vs62 - +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 - + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif xxlor vs17, vs16, vs16 xxlor vs19, vs18, vs18 xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 + xxmrghw vs31, vs51, vs63 + + xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 @@ -965,11 +665,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs27, vs26, vs26 xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 - + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif xxperm vs16, vs4, save_permute_1 xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 xxperm vs24, vs30, save_permute_1 @@ -977,64 +686,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r +#ifdef TRMMKERNEL xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r +#else xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif - stxv vs32, 0(T4) - stxv vs33, 16(T4) - stxv vs34, 32(T4) - stxv vs35, 48(T4) - stxv vs36, 0(T5) - stxv vs37, 16(T5) + + stxv vs38, 32(T5) stxv vs39, 48(T5) + #ifdef TRMMKERNEL xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - + xvmulsp vs41, vs14, alpha_r +#else xvmaddasp vs40, vs10, alpha_r xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - #endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif stxv vs42, 32(T6) stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T7) stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T7) stxv vs47, 48(T7) @@ -1224,12 +946,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 @@ -1247,21 +971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 @@ -1285,21 +1009,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 - + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 @@ -1323,22 +1048,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 - +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.if \Complete==0 lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask +.if \Complete==0 xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index e655f0bfe..a41bcec77 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -30,10 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LOAD ld -#define STACKSIZE 32192 +#define STACKSIZE 512 #define FZERO 312+192(SP) - + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ #define M r3 #define N r4 @@ -56,20 +57,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FRAMEPOINTER r12 -#define BBUFFER r14 +#define T10 r14 #define L r15 -#define ALPHA r16 +#define T8 r16 #define T5 r17 #define T2 r19 -#define BBO r20 -#define o8 r21 +#define T9 r20 +#define T6 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 -#define o16 r27 +#define T7 r27 #define T3 r28 #define T4 r29 @@ -82,12 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROFCODE mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - li r0, 0 - + addi SP, SP, -STACKSIZE + mflr r0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) @@ -111,6 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f30, 128(SP) stfd f31, 136(SP) + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ std r31, 144(SP) std r30, 152(SP) @@ -132,21 +131,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) - + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + + std r0, FLINK_SAVE(SP) - stw r0, FZERO #ifdef linux ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) @@ -162,35 +161,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemm_macros_power9.S" - cmpwi cr0, M, 0 - ble L999 - cmpwi cr0, N, 0 - ble L999 - cmpwi cr0, K, 0 - ble L999 + slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li o8 , 8 - li o16 , 16 - - addi BBUFFER, SP, 512+4096 - li T1, -4096 - and BBUFFER, BBUFFER, T1 - - - addi ALPHA, SP, 296+192 + li PRE, 512 + li r0, 0 - xxlor alpha_r,vs1,vs1 /*copy from register f1 */ - xxlor alpha_i,vs2,vs2 /*copy from register f2 */ +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif .align 4 #include "zgemm_logic_power9.S" L999: - addi r3, 0, 0 - + lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) @@ -233,24 +221,24 @@ L999: ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) - - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE blr EPILOGUE diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index 77ce36294..01685fe79 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -25,155 +25,348 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define MY_ALIGN .align 3 +b ZGEMM_L2 - srawi. J, N, 1 - ble ZGEMM_L2_END - -ZGEMM_L2_BEGIN: - - mr BO, B - mr BBO, BBUFFER - srawi. T1, K, 2 - ble ZGEMM_L2_COPYB1 - -ZGEMM_L2_COPYB8: - - addi T2, PRE, 128 - dcbt BO, PRE - dcbtst BBO, PRE - dcbtst BBO, T2 - ZCOPYB_8 - addic. T1, T1, -1 - - bgt ZGEMM_L2_COPYB8 - -ZGEMM_L2_COPYB1: - - andi. T1, K, 3 - ble ZGEMM_L2_COPYB_END - -ZGEMM_L2_COPYB_LOOP: - - ZCOPYB_2 - addic. T1, T1, -1 - - bgt ZGEMM_L2_COPYB_LOOP - -ZGEMM_L2_COPYB_END: - - mr CO, C - mr AO, A - slwi T1, LDC , 1 - add C, C, T1 - srawi. I, M, 3 - ble ZGEMM_L2x8_END +/* MINI SUBROUTINES */ -ZGEMM_L2x8_BEGIN: - mr BO, BBUFFER - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 32x */ - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - - -ZGEMM_L2x8_LOOP_START: - - LOAD2x8 0 - li T2, 1024 - li T3, 1024+512 - li T4, 2048 - li T5, 2048+512 +/* 2x8 MAIN 128x+1 LOOP */ +ZGEMM_L2x8_LMAIN_SUB: mtctr L - + LOAD2x8 0 MY_ALIGN ZGEMM_L2x8_LOOP: - dcbt AO, PRE + dcbt AO, PRE dcbt BO, PRE - KERNEL2x8_L 128,64,0,0 - KERNEL2x8_L 128,64,1,0 + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 dcbt AO, T2 - KERNEL2x8_L 128,64,2,0 - KERNEL2x8_L 128,64,3,0 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 dcbt AO, T3 dcbt BO, T2 - KERNEL2x8_L 128,64,4,0 - KERNEL2x8_L 128,64,5,0 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 dcbt AO, T4 - KERNEL2x8_L 128,64,6,0 - KERNEL2x8_L 128,64,7,0 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 dcbt AO, T5 dcbt BO, T3 - KERNEL2x8_L 128,64,8,0 - KERNEL2x8_L 128,64,9,0 - KERNEL2x8_L 128,64,10,0 - KERNEL2x8_L 128,64,11,0 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 dcbt BO, T4 - KERNEL2x8_L 128,64,12,0 - KERNEL2x8_L 128,64,13,0 - KERNEL2x8_L 128,64,14,0 - KERNEL2x8_L 128,64,15,1 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,0 + KERNEL2x8_L 128,32,16,0 + KERNEL2x8_L 128,32,17,0 + KERNEL2x8_L 128,32,18,0 + KERNEL2x8_L 128,32,19,0 + KERNEL2x8_L 128,32,20,0 + KERNEL2x8_L 128,32,21,0 + KERNEL2x8_L 128,32,22,0 + KERNEL2x8_L 128,32,23,0 + KERNEL2x8_L 128,32,24,0 + KERNEL2x8_L 128,32,25,0 + KERNEL2x8_L 128,32,26,0 + KERNEL2x8_L 128,32,27,0 + KERNEL2x8_L 128,32,28,0 + KERNEL2x8_L 128,32,29,0 + KERNEL2x8_L 128,32,30,0 + KERNEL2x8_L 128,32,31,0 + KERNEL2x8_L 128,32,32,0 + KERNEL2x8_L 128,32,33,0 + KERNEL2x8_L 128,32,34,0 + KERNEL2x8_L 128,32,35,0 + KERNEL2x8_L 128,32,36,0 + KERNEL2x8_L 128,32,37,0 + KERNEL2x8_L 128,32,38,0 + KERNEL2x8_L 128,32,39,0 + KERNEL2x8_L 128,32,40,0 + KERNEL2x8_L 128,32,41,0 + KERNEL2x8_L 128,32,42,0 + KERNEL2x8_L 128,32,43,0 + KERNEL2x8_L 128,32,44,0 + KERNEL2x8_L 128,32,45,0 + KERNEL2x8_L 128,32,46,0 + KERNEL2x8_L 128,32,47,0 + KERNEL2x8_L 128,32,48,0 + KERNEL2x8_L 128,32,49,0 + KERNEL2x8_L 128,32,50,0 + KERNEL2x8_L 128,32,51,0 + KERNEL2x8_L 128,32,52,0 + KERNEL2x8_L 128,32,53,0 + KERNEL2x8_L 128,32,54,0 + KERNEL2x8_L 128,32,55,0 + KERNEL2x8_L 128,32,56,0 + KERNEL2x8_L 128,32,57,0 + KERNEL2x8_L 128,32,58,0 + KERNEL2x8_L 128,32,59,0 + KERNEL2x8_L 128,32,60,0 + KERNEL2x8_L 128,32,61,0 + KERNEL2x8_L 128,32,62,0 + KERNEL2x8_L 128,32,63,1 bdnz ZGEMM_L2x8_LOOP MY_ALIGN ZGEMM_L2x8_LOOP_END: - END2x8 AO, BO, 128, 64 - - b ZGEMM_L2x8_SUB1 - -ZGEMM_L2x8_SUB0: + END2x8 AO, BO, 128,32 + blr - andi. 
L, K, 63 - - b ZGEMM_L2x8_SUB2 + MY_ALIGN +ZGEMM_2x8_L64_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,0 + KERNEL2x8_L 128,32,16,0 + KERNEL2x8_L 128,32,17,0 + KERNEL2x8_L 128,32,18,0 + KERNEL2x8_L 128,32,19,0 + KERNEL2x8_L 128,32,20,0 + KERNEL2x8_L 128,32,21,0 + KERNEL2x8_L 128,32,22,0 + KERNEL2x8_L 128,32,23,0 + KERNEL2x8_L 128,32,24,0 + KERNEL2x8_L 128,32,25,0 + KERNEL2x8_L 128,32,26,0 + KERNEL2x8_L 128,32,27,0 + KERNEL2x8_L 128,32,28,0 + KERNEL2x8_L 128,32,29,0 + KERNEL2x8_L 128,32,30,0 + KERNEL2x8_E 128,32,31,1 + blr -ZGEMM_L2x8_SUB1: - andi. L, T1, 31 - ble ZGEMM_L2x8_SAVE + MY_ALIGN +ZGEMM_2x8_L32_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,1 + blr + MY_ALIGN + +ZGEMM_2x8_L16_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,1 + blr + MY_ALIGN + +ZGEMM_2x4_LMAIN_SUB: + mtctr L + LOAD2x4 0 + MY_ALIGN +ZGEMM_L2x4_LOOP: + KERNEL2x4_L 64,32,0,0 + KERNEL2x4_L 64,32,1,0 + KERNEL2x4_L 64,32,2,0 + KERNEL2x4_L 64,32,3,0 + KERNEL2x4_L 64,32,4,0 + KERNEL2x4_L 64,32,5,0 + KERNEL2x4_L 64,32,6,0 + KERNEL2x4_L 64,32,7,0 + KERNEL2x4_L 64,32,8,0 + KERNEL2x4_L 64,32,9,0 + KERNEL2x4_L 64,32,10,0 + KERNEL2x4_L 64,32,11,0 + KERNEL2x4_L 64,32,12,0 + KERNEL2x4_L 64,32,13,0 + KERNEL2x4_L 64,32,14,0 + KERNEL2x4_L 64,32,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: + END2x4 AO, BO, 64,32 + blr + + MY_ALIGN +ZGEMM_2x4_L16_SUB: + LOAD2x4 0 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_L 64,32, 1,0 + KERNEL2x4_L 64,32, 2,0 + KERNEL2x4_L 64,32, 3,0 + KERNEL2x4_L 64,32, 4,0 + KERNEL2x4_L 64,32, 5,0 + KERNEL2x4_L 64,32, 6,0 + KERNEL2x4_E 64,32, 7,1 + blr + + MY_ALIGN +ZGEMM_2x4_L8_SUB: + LOAD2x4 0 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_L 64,32, 1,0 + KERNEL2x4_L 64,32, 2,0 + KERNEL2x4_E 64,32, 3,1 + blr +/* MAIN LOOP BEGINS */ + + MY_ALIGN +ZGEMM_L2: + srawi. J, N, 1 + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 + srawi. I, M, 3 + ble ZGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 +ZGEMM_L2x8_BEGIN: + mr T1, K + mr BO, B + dcbt B, r0 + dcbt AO, r0 + /* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + + addi T1,T1, -1 + /* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. L, T1, 7 /**(K-1) % 128x */ + + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + + andi. 
L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB0: + andi. L, K, 255 + cmpwi K,128 + bne ZGEMM_L2x8_SUB2 + MY_ALIGN +ZGEMM_L2x8_SUB2_128: + bl ZGEMM_2x8_L64_SUB + bl ZGEMM_2x8_L64_SUB + b ZGEMM_L2x8_SAVE + MY_ALIGN ZGEMM_L2x8_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x8_SUB2_4 - mtctr T1 + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB MY_ALIGN -ZGEMM_L2x8_SUB2_LOOP: +ZGEMM_L2x8_SUB2_32: + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_16: + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_8: + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 LOAD2x8 0 - KERNEL2x8_L 128,64, 0,0 - KERNEL2x8_L 128,64, 1,0 - KERNEL2x8_L 128,64, 2,0 - KERNEL2x8_E 128,64, 3,1 - bdnz ZGEMM_L2x8_SUB2_LOOP - MY_ALIGN + KERNEL2x8_L 128,32, 0,0 + KERNEL2x8_L 128,32, 1,0 + KERNEL2x8_L 128,32, 2,0 + KERNEL2x8_E 128,32, 3,1 + MY_ALIGN ZGEMM_L2x8_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x8_SUB2_2 LOAD2x8 0 - KERNEL2x8_L 128,64, 0,0 - KERNEL2x8_E 128,64, 1,1 + KERNEL2x8_L 128,32, 0,0 + KERNEL2x8_E 128,32, 1,1 MY_ALIGN ZGEMM_L2x8_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x8_SUB2_1 LOAD2x8 0 - KERNEL2x8_E 128,64, 0,1 + KERNEL2x8_E 128,32, 0,1 MY_ALIGN ZGEMM_L2x8_SUB2_1: andi. T1,L, 1 ble ZGEMM_L2x8_SAVE - KERNEL2x8 - -/* addic. L, L, -1 - bgt ZGEMM_L2x8_SUB2_1*/ + KERNEL2x8 ZGEMM_L2x8_SAVE: - + addic. I, I, -1 SAVE2x8 - addic. I, I, -1 bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN ZGEMM_L2x8_END: ZGEMM_L2x4_BEGIN: @@ -183,70 +376,50 @@ ZGEMM_L2x4_BEGIN: andi. T1, M, 4 ble ZGEMM_L2x4_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 - srawi. L, T1, 4 /**(K-1) % 16x */ - ZERO2x4 - ble ZGEMM_L2x4_SUB0 - -ZGEMM_L2x4_LOOP_START: - LOAD2x4 0 - mtctr L + ZERO2x4 + srawi. L, T1, 5 /**(K-1) % 32x */ - MY_ALIGN -ZGEMM_L2x4_LOOP: - KERNEL2x4_L 64,64,0,0 - KERNEL2x4_L 64,64,1,0 - KERNEL2x4_L 64,64,2,0 - KERNEL2x4_L 64,64,3,0 - KERNEL2x4_L 64,64,4,0 - KERNEL2x4_L 64,64,5,0 - KERNEL2x4_L 64,64,6,0 - KERNEL2x4_L 64,64,7,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: - END2x4 AO, BO, 64, 64 - - b ZGEMM_L2x4_SUB1 - -ZGEMM_L2x4_SUB0: - - andi. L, K, 31 - - b ZGEMM_L2x4_SUB2 - -ZGEMM_L2x4_SUB1: - - andi. L, T1, 15 + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 -ZGEMM_L2x4_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x4_SUB2_4 - mtctr T1 +ZGEMM_L2x4_SUB0: + andi. L, K, 63 + cmpwi K,32 + bne ZGEMM_L2x4_SUB2 + MY_ALIGN +ZGEMM_L2x4_SUB2_32: + bl ZGEMM_2x4_L16_SUB + bl ZGEMM_2x4_L16_SUB + b ZGEMM_L2x4_SAVE + MY_ALIGN +ZGEMM_L2x4_SUB2: + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB MY_ALIGN -ZGEMM_L2x4_SUB2_LOOP: - LOAD2x4 0 - KERNEL2x4_L 64,64, 0,0 - KERNEL2x4_L 64,64, 1,0 - KERNEL2x4_L 64,64, 2,0 - KERNEL2x4_E 64,64, 3,1 - bdnz ZGEMM_L2x4_SUB2_LOOP +ZGEMM_L2x4_SUB2_8: + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB MY_ALIGN ZGEMM_L2x4_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x4_SUB2_2 LOAD2x4 0 - KERNEL2x4_L 64,64, 0,0 - KERNEL2x4_E 64,64, 1,1 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_E 64,32, 1,1 MY_ALIGN ZGEMM_L2x4_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x4_SUB2_1 LOAD2x4 0 - KERNEL2x4_E 64,64, 0,1 + KERNEL2x4_E 64,32, 0,1 MY_ALIGN ZGEMM_L2x4_SUB2_1: andi. T1,L, 1 @@ -259,12 +432,11 @@ ZGEMM_L2x4_SAVE: ZGEMM_L2x4_END: -ZGEMM_L2x2_BEGIN: - +ZGEMM_L2x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L2x2_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. 
L, T1, 4 /**(K-1) % 16x */ @@ -277,18 +449,18 @@ ZGEMM_L2x2_LOOP_START: MY_ALIGN ZGEMM_L2x2_LOOP: - KERNEL2x2_L 32,64,0,0 - KERNEL2x2_L 32,64,1,0 - KERNEL2x2_L 32,64,2,0 - KERNEL2x2_L 32,64,3,0 - KERNEL2x2_L 32,64,4,0 - KERNEL2x2_L 32,64,5,0 - KERNEL2x2_L 32,64,6,0 - KERNEL2x2_L 32,64,7,1 + KERNEL2x2_L 32,32,0,0 + KERNEL2x2_L 32,32,1,0 + KERNEL2x2_L 32,32,2,0 + KERNEL2x2_L 32,32,3,0 + KERNEL2x2_L 32,32,4,0 + KERNEL2x2_L 32,32,5,0 + KERNEL2x2_L 32,32,6,0 + KERNEL2x2_L 32,32,7,1 bdnz ZGEMM_L2x2_LOOP MY_ALIGN ZGEMM_L2x2_LOOP_END: - END2x2 AO, BO, 32, 64 + END2x2 AO, BO, 32,32 b ZGEMM_L2x2_SUB1 @@ -310,24 +482,24 @@ ZGEMM_L2x2_SUB2: MY_ALIGN ZGEMM_L2x2_SUB2_LOOP: LOAD2x2 0 - KERNEL2x2_L 32,64, 0,0 - KERNEL2x2_L 32,64, 1,0 - KERNEL2x2_L 32,64, 2,0 - KERNEL2x2_E 32,64, 3,1 + KERNEL2x2_L 32,32, 0,0 + KERNEL2x2_L 32,32, 1,0 + KERNEL2x2_L 32,32, 2,0 + KERNEL2x2_E 32,32, 3,1 bdnz ZGEMM_L2x2_SUB2_LOOP MY_ALIGN ZGEMM_L2x2_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x2_SUB2_2 LOAD2x2 0 - KERNEL2x2_L 32,64, 0,0 - KERNEL2x2_E 32,64, 1,1 + KERNEL2x2_L 32,32, 0,0 + KERNEL2x2_E 32,32, 1,1 MY_ALIGN ZGEMM_L2x2_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x2_SUB2_1 LOAD2x2 0 - KERNEL2x2_E 32,64, 0,1 + KERNEL2x2_E 32,32, 0,1 MY_ALIGN ZGEMM_L2x2_SUB2_1: andi. T1,L, 1 @@ -339,12 +511,12 @@ ZGEMM_L2x2_SAVE: ZGEMM_L2x2_END: -ZGEMM_L2x1_BEGIN: +ZGEMM_L2x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L2x1_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 4 /**(K-1) % 16x */ @@ -358,18 +530,18 @@ ZGEMM_L2x1_LOOP_START: MY_ALIGN ZGEMM_L2x1_LOOP: - KERNEL2x1_L 16,64,0,0 - KERNEL2x1_L 16,64,1,0 - KERNEL2x1_L 16,64,2,0 - KERNEL2x1_L 16,64,3,0 - KERNEL2x1_L 16,64,4,0 - KERNEL2x1_L 16,64,5,0 - KERNEL2x1_L 16,64,6,0 - KERNEL2x1_L 16,64,7,1 + KERNEL2x1_L 16,32,0,0 + KERNEL2x1_L 16,32,1,0 + KERNEL2x1_L 16,32,2,0 + KERNEL2x1_L 16,32,3,0 + KERNEL2x1_L 16,32,4,0 + KERNEL2x1_L 16,32,5,0 + KERNEL2x1_L 16,32,6,0 + KERNEL2x1_L 16,32,7,1 bdnz ZGEMM_L2x1_LOOP MY_ALIGN ZGEMM_L2x1_LOOP_END: - END2x1 AO, BO, 16, 64 + END2x1 AO, BO, 16,32 b ZGEMM_L2x1_SUB1 @@ -391,24 +563,24 @@ ZGEMM_L2x1_SUB2: MY_ALIGN ZGEMM_L2x1_SUB2_LOOP: LOAD2x1 0 - KERNEL2x1_L 16,64, 0,0 - KERNEL2x1_L 16,64, 1,0 - KERNEL2x1_L 16,64, 2,0 - KERNEL2x1_E 16,64, 3,1 + KERNEL2x1_L 16,32, 0,0 + KERNEL2x1_L 16,32, 1,0 + KERNEL2x1_L 16,32, 2,0 + KERNEL2x1_E 16,32, 3,1 bdnz ZGEMM_L2x1_SUB2_LOOP MY_ALIGN ZGEMM_L2x1_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x1_SUB2_2 LOAD2x1 0 - KERNEL2x1_L 16,64, 0,0 - KERNEL2x1_E 16,64, 1,1 + KERNEL2x1_L 16,32, 0,0 + KERNEL2x1_E 16,32, 1,1 MY_ALIGN ZGEMM_L2x1_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x1_SUB2_1 LOAD2x1 0 - KERNEL2x1_E 16,64, 0,1 + KERNEL2x1_E 16,32, 0,1 MY_ALIGN ZGEMM_L2x1_SUB2_1: andi. T1,L, 1 @@ -442,36 +614,6 @@ ZGEMM_L1_BEGIN: andi. T1, N, 1 ble ZGEMM_L1_END - mr BO, B - mr BBO, BBUFFER - srawi. T1, K, 3 /*this time K/8 */ - ble ZGEMM_L1_COPYB1 - -ZGEMM_L1_COPYB8: - - addi T2, PRE, 128 - dcbt BO, PRE - dcbtst BBO, PRE - dcbtst BBO, T2 - ZCOPYB_8 - addic. T1, T1, -1 - - bgt ZGEMM_L1_COPYB8 - -ZGEMM_L1_COPYB1: - - andi. T1, K, 7 - ble ZGEMM_L1_COPYB_END - -ZGEMM_L1_COPYB_LOOP: - - ZCOPYB_1 - addic. T1, T1, -1 - - bgt ZGEMM_L1_COPYB_LOOP - -ZGEMM_L1_COPYB_END: - mr CO, C mr AO, A srawi. I, M, 3 @@ -480,7 +622,7 @@ ZGEMM_L1_COPYB_END: ZGEMM_L1x8_BEGIN: - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. 
L, T1, 5 /**(K-1) % 32x */ @@ -501,33 +643,33 @@ ZGEMM_L1x8_LOOP_START: ZGEMM_L1x8_LOOP: dcbt AO, PRE dcbt BO, PRE - KERNEL1x8_L 128,32,0,0 - KERNEL1x8_L 128,32,1,0 + KERNEL1x8_L 128,16,0,0 + KERNEL1x8_L 128,16,1,0 dcbt AO, T2 - KERNEL1x8_L 128,32,2,0 - KERNEL1x8_L 128,32,3,0 + KERNEL1x8_L 128,16,2,0 + KERNEL1x8_L 128,16,3,0 dcbt AO, T3 dcbt BO, T2 - KERNEL1x8_L 128,32,4,0 - KERNEL1x8_L 128,32,5,0 + KERNEL1x8_L 128,16,4,0 + KERNEL1x8_L 128,16,5,0 dcbt AO, T4 - KERNEL1x8_L 128,32,6,0 - KERNEL1x8_L 128,32,7,0 + KERNEL1x8_L 128,16,6,0 + KERNEL1x8_L 128,16,7,0 dcbt AO, T5 dcbt BO, T3 - KERNEL1x8_L 128,32,8,0 - KERNEL1x8_L 128,32,9,0 - KERNEL1x8_L 128,32,10,0 - KERNEL1x8_L 128,32,11,0 + KERNEL1x8_L 128,16,8,0 + KERNEL1x8_L 128,16,9,0 + KERNEL1x8_L 128,16,10,0 + KERNEL1x8_L 128,16,11,0 dcbt BO, T4 - KERNEL1x8_L 128,32,12,0 - KERNEL1x8_L 128,32,13,0 - KERNEL1x8_L 128,32,14,0 - KERNEL1x8_L 128,32,15,1 + KERNEL1x8_L 128,16,12,0 + KERNEL1x8_L 128,16,13,0 + KERNEL1x8_L 128,16,14,0 + KERNEL1x8_L 128,16,15,1 bdnz ZGEMM_L1x8_LOOP MY_ALIGN ZGEMM_L1x8_LOOP_END: - END1x8 AO, BO, 128, 32 + END1x8 AO, BO, 128,16 b ZGEMM_L1x8_SUB1 @@ -549,32 +691,30 @@ ZGEMM_L1x8_SUB2: MY_ALIGN ZGEMM_L1x8_SUB2_LOOP: LOAD1x8 0 - KERNEL1x8_L 128,32, 0,0 - KERNEL1x8_L 128,32, 1,0 - KERNEL1x8_L 128,32, 2,0 - KERNEL1x8_E 128,32, 3,1 + KERNEL1x8_L 128,16, 0,0 + KERNEL1x8_L 128,16, 1,0 + KERNEL1x8_L 128,16, 2,0 + KERNEL1x8_E 128,16, 3,1 bdnz ZGEMM_L1x8_SUB2_LOOP MY_ALIGN ZGEMM_L1x8_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x8_SUB2_2 LOAD1x8 0 - KERNEL1x8_L 128,32, 0,0 - KERNEL1x8_E 128,32, 1,1 + KERNEL1x8_L 128,16, 0,0 + KERNEL1x8_E 128,16, 1,1 MY_ALIGN ZGEMM_L1x8_SUB2_2: andi. T1,L, 2 ble ZGEMM_L1x8_SUB2_1 LOAD1x8 0 - KERNEL1x8_E 128,32, 0,1 + KERNEL1x8_E 128,16, 0,1 MY_ALIGN ZGEMM_L1x8_SUB2_1: andi. T1,L, 1 ble ZGEMM_L1x8_SAVE KERNEL1x8 - -/* addic. L, L, -1 - bgt ZGEMM_L1x8_SUB2_1*/ + ZGEMM_L1x8_SAVE: @@ -592,7 +732,7 @@ ZGEMM_L1x4_BEGIN: andi. T1, M, 4 ble ZGEMM_L1x4_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -605,26 +745,26 @@ ZGEMM_L1x4_LOOP_START: MY_ALIGN ZGEMM_L1x4_LOOP: - KERNEL1x4_L 64,32,0,0 - KERNEL1x4_L 64,32,1,0 - KERNEL1x4_L 64,32,2,0 - KERNEL1x4_L 64,32,3,0 - KERNEL1x4_L 64,32,4,0 - KERNEL1x4_L 64,32,5,0 - KERNEL1x4_L 64,32,6,0 - KERNEL1x4_L 64,32,7,0 - KERNEL1x4_L 64,32,8,0 - KERNEL1x4_L 64,32,9,0 - KERNEL1x4_L 64,32,10,0 - KERNEL1x4_L 64,32,11,0 - KERNEL1x4_L 64,32,12,0 - KERNEL1x4_L 64,32,13,0 - KERNEL1x4_L 64,32,14,0 - KERNEL1x4_L 64,32,15,1 + KERNEL1x4_L 64,16,0,0 + KERNEL1x4_L 64,16,1,0 + KERNEL1x4_L 64,16,2,0 + KERNEL1x4_L 64,16,3,0 + KERNEL1x4_L 64,16,4,0 + KERNEL1x4_L 64,16,5,0 + KERNEL1x4_L 64,16,6,0 + KERNEL1x4_L 64,16,7,0 + KERNEL1x4_L 64,16,8,0 + KERNEL1x4_L 64,16,9,0 + KERNEL1x4_L 64,16,10,0 + KERNEL1x4_L 64,16,11,0 + KERNEL1x4_L 64,16,12,0 + KERNEL1x4_L 64,16,13,0 + KERNEL1x4_L 64,16,14,0 + KERNEL1x4_L 64,16,15,1 bdnz ZGEMM_L1x4_LOOP MY_ALIGN ZGEMM_L1x4_LOOP_END: - END1x4 AO, BO, 64, 32 + END1x4 AO, BO, 64,16 b ZGEMM_L1x4_SUB1 @@ -646,24 +786,24 @@ ZGEMM_L1x4_SUB2: MY_ALIGN ZGEMM_L1x4_SUB2_LOOP: LOAD1x4 0 - KERNEL1x4_L 64,32, 0,0 - KERNEL1x4_L 64,32, 1,0 - KERNEL1x4_L 64,32, 2,0 - KERNEL1x4_E 64,32, 3,1 + KERNEL1x4_L 64,16, 0,0 + KERNEL1x4_L 64,16, 1,0 + KERNEL1x4_L 64,16, 2,0 + KERNEL1x4_E 64,16, 3,1 bdnz ZGEMM_L1x4_SUB2_LOOP MY_ALIGN ZGEMM_L1x4_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x4_SUB2_2 LOAD1x4 0 - KERNEL1x4_L 64,32, 0,0 - KERNEL1x4_E 64,32, 1,1 + KERNEL1x4_L 64,16, 0,0 + KERNEL1x4_E 64,16, 1,1 MY_ALIGN ZGEMM_L1x4_SUB2_2: andi. 
T1,L, 2 ble ZGEMM_L1x4_SUB2_1 LOAD1x4 0 - KERNEL1x4_E 64,32, 0,1 + KERNEL1x4_E 64,16, 0,1 MY_ALIGN ZGEMM_L1x4_SUB2_1: andi. T1,L, 1 @@ -681,7 +821,7 @@ ZGEMM_L1x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L1x2_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -694,26 +834,26 @@ ZGEMM_L1x2_LOOP_START: MY_ALIGN ZGEMM_L1x2_LOOP: - KERNEL1x2_L 32,32,0,0 - KERNEL1x2_L 32,32,1,0 - KERNEL1x2_L 32,32,2,0 - KERNEL1x2_L 32,32,3,0 - KERNEL1x2_L 32,32,4,0 - KERNEL1x2_L 32,32,5,0 - KERNEL1x2_L 32,32,6,0 - KERNEL1x2_L 32,32,7,0 - KERNEL1x2_L 32,32,8,0 - KERNEL1x2_L 32,32,9,0 - KERNEL1x2_L 32,32,10,0 - KERNEL1x2_L 32,32,11,0 - KERNEL1x2_L 32,32,12,0 - KERNEL1x2_L 32,32,13,0 - KERNEL1x2_L 32,32,14,0 - KERNEL1x2_L 32,32,15,1 + KERNEL1x2_L 32,16,0,0 + KERNEL1x2_L 32,16,1,0 + KERNEL1x2_L 32,16,2,0 + KERNEL1x2_L 32,16,3,0 + KERNEL1x2_L 32,16,4,0 + KERNEL1x2_L 32,16,5,0 + KERNEL1x2_L 32,16,6,0 + KERNEL1x2_L 32,16,7,0 + KERNEL1x2_L 32,16,8,0 + KERNEL1x2_L 32,16,9,0 + KERNEL1x2_L 32,16,10,0 + KERNEL1x2_L 32,16,11,0 + KERNEL1x2_L 32,16,12,0 + KERNEL1x2_L 32,16,13,0 + KERNEL1x2_L 32,16,14,0 + KERNEL1x2_L 32,16,15,1 bdnz ZGEMM_L1x2_LOOP MY_ALIGN ZGEMM_L1x2_LOOP_END: - END1x2 AO, BO, 32, 32 + END1x2 AO, BO, 32,16 b ZGEMM_L1x2_SUB1 @@ -735,24 +875,24 @@ ZGEMM_L1x2_SUB2: MY_ALIGN ZGEMM_L1x2_SUB2_LOOP: LOAD1x2 0 - KERNEL1x2_L 32,32, 0,0 - KERNEL1x2_L 32,32, 1,0 - KERNEL1x2_L 32,32, 2,0 - KERNEL1x2_E 32,32, 3,1 + KERNEL1x2_L 32,16, 0,0 + KERNEL1x2_L 32,16, 1,0 + KERNEL1x2_L 32,16, 2,0 + KERNEL1x2_E 32,16, 3,1 bdnz ZGEMM_L1x2_SUB2_LOOP MY_ALIGN ZGEMM_L1x2_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x2_SUB2_2 LOAD1x2 0 - KERNEL1x2_L 32,32, 0,0 - KERNEL1x2_E 32,32, 1,1 + KERNEL1x2_L 32,16, 0,0 + KERNEL1x2_E 32,16, 1,1 MY_ALIGN ZGEMM_L1x2_SUB2_2: andi. T1,L, 2 ble ZGEMM_L1x2_SUB2_1 LOAD1x2 0 - KERNEL1x2_E 32,32, 0,1 + KERNEL1x2_E 32,16, 0,1 MY_ALIGN ZGEMM_L1x2_SUB2_1: andi. T1,L, 1 @@ -769,7 +909,7 @@ ZGEMM_L1x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L1x1_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -783,26 +923,26 @@ ZGEMM_L1x1_LOOP_START: MY_ALIGN ZGEMM_L1x1_LOOP: - KERNEL1x1_L 16,32,0,0 - KERNEL1x1_L 16,32,1,0 - KERNEL1x1_L 16,32,2,0 - KERNEL1x1_L 16,32,3,0 - KERNEL1x1_L 16,32,4,0 - KERNEL1x1_L 16,32,5,0 - KERNEL1x1_L 16,32,6,0 - KERNEL1x1_L 16,32,7,0 - KERNEL1x1_L 16,32,8,0 - KERNEL1x1_L 16,32,9,0 - KERNEL1x1_L 16,32,10,0 - KERNEL1x1_L 16,32,11,0 - KERNEL1x1_L 16,32,12,0 - KERNEL1x1_L 16,32,13,0 - KERNEL1x1_L 16,32,14,0 - KERNEL1x1_L 16,32,15,1 + KERNEL1x1_L 16,16,0,0 + KERNEL1x1_L 16,16,1,0 + KERNEL1x1_L 16,16,2,0 + KERNEL1x1_L 16,16,3,0 + KERNEL1x1_L 16,16,4,0 + KERNEL1x1_L 16,16,5,0 + KERNEL1x1_L 16,16,6,0 + KERNEL1x1_L 16,16,7,0 + KERNEL1x1_L 16,16,8,0 + KERNEL1x1_L 16,16,9,0 + KERNEL1x1_L 16,16,10,0 + KERNEL1x1_L 16,16,11,0 + KERNEL1x1_L 16,16,12,0 + KERNEL1x1_L 16,16,13,0 + KERNEL1x1_L 16,16,14,0 + KERNEL1x1_L 16,16,15,1 bdnz ZGEMM_L1x1_LOOP MY_ALIGN ZGEMM_L1x1_LOOP_END: - END1x1 AO, BO, 16, 32 + END1x1 AO, BO, 16, 16 b ZGEMM_L1x1_SUB1 @@ -824,24 +964,24 @@ ZGEMM_L1x1_SUB2: MY_ALIGN ZGEMM_L1x1_SUB2_LOOP: LOAD1x1 0 - KERNEL1x1_L 16,32, 0,0 - KERNEL1x1_L 16,32, 1,0 - KERNEL1x1_L 16,32, 2,0 - KERNEL1x1_E 16,32, 3,1 + KERNEL1x1_L 16,16, 0,0 + KERNEL1x1_L 16,16, 1,0 + KERNEL1x1_L 16,16, 2,0 + KERNEL1x1_E 16,16, 3,1 bdnz ZGEMM_L1x1_SUB2_LOOP MY_ALIGN ZGEMM_L1x1_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x1_SUB2_2 LOAD1x1 0 - KERNEL1x1_L 16,32, 0,0 - KERNEL1x1_E 16,32, 1,1 + KERNEL1x1_L 16,16, 0,0 + KERNEL1x1_E 16,16, 1,1 MY_ALIGN ZGEMM_L1x1_SUB2_2: andi. 
T1,L, 2 ble ZGEMM_L1x1_SUB2_1 LOAD1x1 0 - KERNEL1x1_E 16,32, 0,1 + KERNEL1x1_E 16,16, 0,1 MY_ALIGN ZGEMM_L1x1_SUB2_1: andi. T1,L, 1 diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 93a309ad1..10d9e4cc3 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -25,68 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xssubdp - #define XSFADD_I1 xsadddp - #define XSFADD_I2 xsadddp - -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xsadddp - #define XSFADD_I1 xssubdp - #define XSFADD_I2 xsadddp - -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xsadddp - #define XSFADD_I1 xsadddp - #define XSFADD_I2 xssubdp - -#else // CC || CR || RC || RR - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xssubdp - #define XSFADD_I1 xssubdp - #define XSFADD_I2 xssubdp - -#endif - -.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V - AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7 -.endm - -.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8 - xxlxor \TEMP1, \TEMP1, \TEMP1 - xxlxor \TEMP2, \TEMP2, \TEMP2 - - xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB - - XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB - XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB - - xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB - xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB - - XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB - XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB - - xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i - xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r - xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r - xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i - - xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i - xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r - xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part -.endm - -/********************************************************************************************** -* Macros for N=2 and M=8 -**********************************************************************************************/ #define unit_size 16 #define DISP32(ind,disp) (ind*unit_size*32+disp) @@ -95,735 +33,770 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
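The removed XSFADD_* selection and AGGREGATE_INTO_COMPLEX worked one complex element at a time with scalar xsadddp/xssubdp; the helper macros that follow perform the same aggregation on two results per vector register. For reference, the per-element arithmetic they implement reduces to this C sketch for the plain NN case with C preloaded (the non-TRMM path); the names are illustrative, not from the patch:

    typedef struct { double r, i; } zdouble;

    static zdouble zsave_nn(zdouble c, zdouble a, zdouble b, zdouble alpha)
    {
        /* KERNEL2x*: vs32 = {a.r*b.r, a.i*b.i}, vs33 = {a.r*b.i, a.i*b.r} */
        double rr = a.r * b.r, ii = a.i * b.i;
        double ri = a.r * b.i, ir = a.i * b.r;
        /* AGGREGATE_REALS_IMAGES (NN case): re = rr - ii, im = ri + ir */
        double re = rr - ii, im = ri + ir;
        /* MULT_APLHA_PART1/2: c += alpha * (re + I*im) */
        c.r += alpha.r * re - alpha.i * im;
        c.i += alpha.r * im + alpha.i * re;
        return c;
    }

For the conjugate-both cases (CC/CR/RC/RR) the prologue negates alpha_r and alpha_i with xvnegdp, which is what lets AGGREGATE_REALS_IMAGES stay addition-shaped there instead of spending extra subtractions on the sign flips.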
#define DISP4(ind,disp) (ind*unit_size*4+disp) #define DISP2(ind,disp) (ind*unit_size*2+disp) #define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +/* HELPERS FOR SAVE */ + +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm + +/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm + +/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm + +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm + +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm + +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm + +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm +.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE 
\VSRes9,\VSRes11,vs10,vs11 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs6,vs8,vs16,vs17 + MULT_APLHA_PART2 vs2,vs4,vs14,vs15 + AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + MULT_APLHA_PART1 vs10,vs12, vs24,vs25 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + MULT_APLHA_PART2 vs10,vs12,vs24,vs25 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 + UNPACK_FOR_STORE vs24,vs25,vs10,vs12 + UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART1 vs6,vs8, vs16,vs17 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 +#ifndef TRMMKERNEL + lxv vs18, (\LOFFSET)(\BASE_REG) + xxmrgld vs14,vs18,vs18 + xxmrghd vs15,vs18,vs18 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + xxmrghd vs7,vs15,vs14 + stxv vs7, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ .macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - 
xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 .endm .macro LOAD2x8 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A .if \Zero==1 - Zero2x8 + Zero2x8 .endif .endm .macro END2x8_NORMAL - END2x8 AO,BO,128,64 + END2x8 AO,BO,128,32 .endm -.macro END2x8 AREG, BREG, OffsetA, OffsetB +.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag - - xvmaddadp vs48, vs0, vs18 // real*real, imag*real - xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag - 
xvmaddadp vs50, vs1, vs18 // real*real, imag*real - xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs52, vs2, vs18 // real*real, imag*real - xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs54, vs3, vs18 // real*real, imag*real - xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - xvmaddadp vs56, vs4, vs18 // real*real, imag*real - xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag - xvmaddadp vs58, vs5, vs18 // real*real, imag*real - xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - xvmaddadp vs60, vs6, vs18 // real*real, imag*real - xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag - xvmaddadp vs62, vs7, vs18 // real*real, imag*real - xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag - -.endm - -.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag - - xvmaddadp vs48, vs0, vs18 // real*real, imag*real - xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs50, vs1, vs18 // real*real, imag*real - xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs52, vs2, vs18 // real*real, imag*real - xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs54, vs3, vs18 // real*real, imag*real - xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - xvmaddadp vs56, vs4, vs18 // real*real, imag*real - xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag - xvmaddadp vs58, vs5, vs18 // real*real, imag*real - xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - xvmaddadp vs60, vs6, vs18 // real*real, imag*real - xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag - xvmaddadp vs62, vs7, vs18 // real*real, imag*real - xvmaddadp vs63, 
vs7, vs19 // real*imag, imag*imag - -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP8(\Index,128) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs12, vs20 // real*real, imag*real - xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag - xvmaddadp vs42, vs13, vs20 // real*real, imag*real - xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - xvmaddadp vs44, vs14, vs20 // real*real, imag*real - xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag - xvmaddadp vs46, vs15, vs20 // real*real, imag*real - xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag - - xvmaddadp vs48, vs8, vs22 // real*real, imag*real - xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs50, vs9, vs22 // real*real, imag*real - xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs52, vs10, vs22 // real*real, imag*real - xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs54, vs11, vs22 // real*real, imag*real - xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag - xvmaddadp vs56, vs12, vs22 // real*real, imag*real - xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag - xvmaddadp vs58, vs13, vs22 // real*real, imag*real - xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag - xvmaddadp vs60, vs14, vs22 // real*real, imag*real - xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag - xvmaddadp vs62, vs15, vs22 // real*real, imag*real - xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag - -.endm - -.macro KERNEL2x8 - LOAD2x8 0 - END2x8 AO, BO, 128,64 -.endm + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 -.macro SAVE2x8 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 - mr T1, CO - addi T2, T1, 64 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 -#ifndef TRMMKERNEL + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 -#endif + xvmaddadp vs42, vs5, vs16 + xvmaddadp 
vs58, vs5, vs18 - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 -#endif + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) -#endif + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 - AGGREGATE_INTO_COMPLEX vs48,vs49,vs8 - AGGREGATE_INTO_COMPLEX vs50,vs51,vs9 - AGGREGATE_INTO_COMPLEX vs52,vs53,vs10 - AGGREGATE_INTO_COMPLEX vs54,vs55,vs11 - AGGREGATE_INTO_COMPLEX vs56,vs57,vs12 - AGGREGATE_INTO_COMPLEX vs58,vs59,vs13 - AGGREGATE_INTO_COMPLEX vs60,vs61,vs14 - AGGREGATE_INTO_COMPLEX vs62,vs63,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 -#endif + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - addi CO, CO, 128 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 .endm -/********************************************************************************************** -* Macros for N=2 and M=4 -**********************************************************************************************/ +.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 +.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro LOAD2x4 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 
48(AO) // load real,imag from A - -.if \Zero==1 - Zero2x4 -.endif + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 -.endm + xxswapd vs21, vs20 + xxswapd vs23, vs22 -.macro END2x4_NORMAL - END2x4 AO,BO,64,64 -.endm + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 -.macro END2x4 AREG, BREG, OffsetA, OffsetB + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag - -.endm - -.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // 
real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 .if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP8(\Index,128) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - - xvmaddadp vs40, vs8, vs22 // real*real, imag*real - xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs42, vs9, vs22 // real*real, imag*real - xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs44, vs10, vs22 // real*real, imag*real - xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs46, vs11, vs22 // real*real, imag*real - xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag -.endm + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 -.macro KERNEL2x4 - LOAD2x4 0 - END2x4 AO, BO, 64,64 -.endm +.if \Complete==0 + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 -.macro SAVE2x4 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 - mr T1, CO + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 -#ifndef TRMMKERNEL +.if \Complete==0 + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 -#endif + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 + +.if \Complete==0 + lxv vs6, DISP16(\Index,192 +32 + 
\OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A +.endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif -#ifndef TRMMKERNEL +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif -#endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 - add T1, T1, LDC + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 -#ifndef TRMMKERNEL + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, vs23 -#endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs8 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs9 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs10 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs11 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 -#ifndef TRMMKERNEL +.endm - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 +.macro KERNEL2x8 + LOAD2x8 0 + END2x8 AO, BO, 128,32 +.endm -#endif +.macro SAVE2x8 - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - addi CO, CO, 64 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 .endm /********************************************************************************************** -* Macros for N=2 and M=2 +* Macros for N=2 and M=4 **********************************************************************************************/ -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 .endm -.macro LOAD2x2 Zero +.macro LOAD2x4 Zero - lxv 
vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - .if \Zero==1 - Zero2x2 + Zero2x4 .endif .endm -.macro END2x2_NORMAL - END2x2 AO,BO,32,64 +.macro END2x4_NORMAL + END2x4 AO,BO,64,32 .endm -.macro END2x2 AREG, BREG, OffsetA, OffsetB +.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 - xvmaddadp vs36, vs0, vs18 // real*real, imag*real - xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs38, vs1, vs18 // real*real, imag*real - xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag - -.endm + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 -.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 -.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B +.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs36, vs0, vs18 // real*real, imag*real - xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs38, vs1, vs18 // real*real, imag*real - xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xxswapd vs23, 
vs22 + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) +.endif +.endif + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 .if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 + + +.if \Complete==0 + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP8(\Index,128) + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif .endif -.endif - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif - xvmaddadp vs36, vs8, vs22 // real*real, imag*real - xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs38, vs9, vs22 // real*real, imag*real - xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag - + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs41, vs8, vs23 + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs43, vs9, vs23 + + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs45, vs10, vs23 + + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs47, vs11, vs23 + .endm -.macro KERNEL2x2 - LOAD2x2 0 - END2x2 AO, BO, 32,64 +.macro KERNEL2x4 + LOAD2x4 0 + END2x4 AO, BO, 64,32 .endm -.macro SAVE2x2 +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 - mr T1, CO +.endm -#ifndef TRMMKERNEL +/********************************************************************************************** +* Macros for N=2 and M=2 
+**********************************************************************************************/ - lxv vs16, 0(T1) - lxv vs17, 16(T1) +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm -#endif +.macro LOAD2x2 Zero - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 -#ifndef TRMMKERNEL + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 -#endif +.if \Zero==1 + Zero2x2 +.endif +.endm - stxv vs8, 0(T1) - stxv vs9, 16(T1) +.macro END2x2_NORMAL + END2x2 AO,BO,32,32 +.endm - add T1, T1, LDC +.macro END2x2 AREG, BREG, OffsetA, OffsetB -#ifndef TRMMKERNEL +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif - lxv vs16, 0(T1) - lxv vs17, 16(T1) + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs37, vs0, vs19 -#endif + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs8 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs9 +.endm -#ifndef TRMMKERNEL +.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 +.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm -#endif +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - stxv vs8, 0(T1) - stxv vs9, 16(T1) - - addi CO, CO, 32 + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs37, vs0, vs19 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 + +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs37, vs8, vs23 + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs39, vs9, vs23 + +.endm + +.macro KERNEL2x2 + LOAD2x2 0 + END2x2 AO, BO, 32,32 +.endm + +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 .endm 
/********************************************************************************************** @@ -831,348 +804,288 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 .endm .macro LOAD2x1 Zero - lxv vs0, 0(AO) // load real,imag from A + lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 .if \Zero==1 - Zero2x1 -.endif - + Zero2x1 +.endif .endm .macro END2x1_NORMAL - END2x1 AO,BO,16,64 + END2x1 AO,BO,16,32 .endm -.macro END2x1 AREG, BREG, OffsetA, OffsetB +.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 - xvmaddadp vs34, vs0, vs18 // real*real, imag*real - xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag - .endm -.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + xxswapd vs21, vs20 + xxswapd vs23, vs22 +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs34, vs0, vs18 // real*real, imag*real - xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 .if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load 
real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP8(\Index,128) +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif .endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs8, vs22 // real*real, imag*real - xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag - +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs35, vs8, vs23 + .endm -.macro KERNEL2x1 +.macro KERNEL2x1 LOAD2x1 0 - END2x1 AO, BO, 16,64 + END2x1 AO, BO, 16,32 .endm .macro SAVE2x1 - - mr T1, CO -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - - AGGREGATE_INTO_COMPLEX vs34,vs35,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - - addi CO, CO, 16 - + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 .endm .macro LOAD1x8 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag 
from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A .if \Zero==1 - Zero1x8 + Zero1x8 .endif .endm .macro END1x8_NORMAL - END1x8 AO,BO,128,32 + END1x8 AO,BO,128,16 .endm -.macro END1x8 AREG, BREG, OffsetA, OffsetB +.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag - -.endm - -.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag - -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs4, DISP16(\Index, 192 + 
\OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs12, vs20 // real*real, imag*real - xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag - xvmaddadp vs42, vs13, vs20 // real*real, imag*real - xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - xvmaddadp vs44, vs14, vs20 // real*real, imag*real - xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag - xvmaddadp vs46, vs15, vs20 // real*real, imag*real - xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag - -.endm - -.macro KERNEL1x8 - LOAD1x8 0 - END1x8 AO, BO, 128,32 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + .endm -.macro SAVE1x8 +.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm - mr T1, CO - addi T2, T1, 64 +.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm -#ifndef TRMMKERNEL +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21, vs20 -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + 
\OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -#endif + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 + +.endm - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) +.macro KERNEL1x8 + LOAD1x8 0 + END1x8 AO, BO, 128,16 +.endm + +.macro SAVE1x8 - addi CO, CO, 128 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 .endm @@ -1181,170 +1094,143 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 .endm .macro LOAD1x4 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17,vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // 
load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - .if \Zero==1 - Zero1x4 + Zero1x4 .endif .endm .macro END1x4_NORMAL - END1x4 AO,BO,64,32 + END1x4 AO,BO,64,16 .endm -.macro END1x4 AREG, BREG, OffsetA, OffsetB +.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 .endm -.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21,vs20 -lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, 
vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 +.if \Complete==0 + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - - xvmaddadp vs40, vs8, vs22 // real*real, imag*real - xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs42, vs9, vs22 // real*real, imag*real - xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs44, vs10, vs22 // real*real, imag*real - xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs46, vs11, vs22 // real*real, imag*real - xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs41, vs8, vs23 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs43, vs9, vs23 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs47, vs11, vs23 .endm -.macro KERNEL1x4 +.macro KERNEL1x4 LOAD1x4 0 - END1x4 AO, BO, 64,32 + END1x4 AO, BO, 64,16 .endm .macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - 
xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - addi CO, CO, 64 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, CO, 64 .endm @@ -1353,122 +1239,99 @@ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 .endm .macro LOAD1x2 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17,vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A .if \Zero==1 - Zero1x2 + Zero1x2 .endif .endm .macro END1x2_NORMAL - END1x2 AO,BO,32,32 + END1x2 AO,BO,32,16 .endm -.macro END1x2 AREG, BREG, OffsetA, OffsetB +.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + .endm -.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21,vs20 -lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag 
from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif .endif -.endif - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 .endm -.macro KERNEL1x2 +.macro KERNEL1x2 LOAD1x2 0 - END1x2 AO, BO, 32,32 + END1x2 AO, BO, 32,16 .endm .macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - -addi CO, CO, 32 - + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 .endm /********************************************************************************************** @@ -1476,189 +1339,89 @@ addi CO, CO, 32 **********************************************************************************************/ .macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 .endm .macro LOAD1x1 Zero - lxv vs0, 0(AO) // load real,imag from A - - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs0, 0(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17, vs16 .if \Zero==1 - Zero1x1 + Zero1x1 .endif - + .endm .macro END1x1_NORMAL - END1x1 AO,BO,16,32 + END1x1 AO,BO,16,16 .endm -.macro END1x1 AREG, BREG, OffsetA, OffsetB +.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - - -.endm + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 -.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm -.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21, vs20 - 
xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 .if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17, vs16 .endif - -.if \IsLast==1 +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) .endif .endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - - -.endm - -.macro KERNEL1x1 - LOAD1x1 0 - END1x1 AO, BO, 16,32 - -.endm -.macro SAVE1x1 + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 - mr T1, CO -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - -addi CO, CO, 16 - -.endm - - -.macro ZCOPYB_2 - - lxv vs32, 0(BO) - lxv vs33, 16(BO) - addi BO, BO, 32 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - xxspltd vs42, vs33, 1 - xxspltd vs43, vs33, 0 - - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - stxv vs42, 32(BBO) - stxv vs43, 48(BBO) - addi BBO, BBO, 64 .endm -.macro ZCOPYB_1 - - lxv vs32, 0(BO) - addi BO, BO, 16 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - - addi BBO, BBO, 32 +.macro KERNEL1x1 + LOAD1x1 0 + END1x1 AO, BO, 16,16 .endm -.macro ZCOPYB_8 - - lxv vs32, 0(BO) - lxv vs33, 16(BO) - lxv vs34, 32(BO) - lxv vs35, 48(BO) - - lxv vs36, 64+0(BO) - lxv vs37, 64+16(BO) - lxv vs38, 64+32(BO) - lxv vs39, 64+48(BO) - addi BO, BO, 128 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - xxspltd vs42, vs33, 1 - xxspltd vs43, vs33, 0 - xxspltd vs44, vs34, 1 - xxspltd vs45, vs34, 0 - xxspltd vs46, vs35, 1 - xxspltd vs47, vs35, 0 - - xxspltd vs48, vs36, 1 - xxspltd vs49, vs36, 0 - xxspltd vs50, vs37, 1 - xxspltd vs51, vs37, 0 - xxspltd vs52, vs38, 1 - xxspltd vs53, vs38, 0 - xxspltd vs54, vs39, 1 - xxspltd vs55, vs39, 0 - - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - stxv vs42, 32(BBO) - stxv vs43, 48(BBO) - - stxv vs44, 64+0(BBO) - stxv vs45, 64+16(BBO) - stxv vs46, 64+32(BBO) - stxv vs47, 64+48(BBO) - - stxv vs48, 128+ 0(BBO) - stxv vs49, 128+ 16(BBO) - stxv vs50, 128+ 32(BBO) - stxv vs51, 128+ 48(BBO) - - stxv vs52, 192 + 0(BBO) - stxv vs53, 192 + 16(BBO) - stxv vs54, 192+ 32(BBO) - stxv vs55, 192 + 48(BBO) - addi BBO, BBO, 256 - +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 .endm diff --git a/param.h b/param.h index d0b8518c9..8f78a6a64 100644 --- a/param.h +++ b/param.h @@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P 640
+#define SGEMM_DEFAULT_P 832
 #define DGEMM_DEFAULT_P 128
 #define CGEMM_DEFAULT_P 640
-#define ZGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_P 256
 
-#define SGEMM_DEFAULT_Q 1408
+#define SGEMM_DEFAULT_Q 1025
 #define DGEMM_DEFAULT_Q 384
 #define CGEMM_DEFAULT_Q 640
-#define ZGEMM_DEFAULT_Q 1152
+#define ZGEMM_DEFAULT_Q 1025
 
 #define SYMV_P 8
 

From 7a9a4dbc4fdd748747cd86ae685e760ae8cdc10f Mon Sep 17 00:00:00 2001
From: Michael Lass
Date: Fri, 3 May 2019 21:07:14 +0200
Subject: [PATCH 615/935] Fix detection of AVX512 capable compilers in getarch

21eda8b5 introduced a check in getarch.c to test whether the compiler is
capable of AVX512. This check currently fails because the __AVX2__ macro
it relies on is only defined if getarch itself was compiled with
AVX2/AVX512 support. Make sure this is the case by building getarch with
-march=native on x86_64; getarch is only supposed to run on the build
host anyway.
---
 Makefile.system    | 9 +++++++++
 cmake/system.cmake | 5 +++++
 2 files changed, 14 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index f574edf88..eb57cbb30 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -9,6 +9,11 @@ ifndef TOPDIR
 TOPDIR = .
 endif
 
+# If ARCH is not set, we use the host system's architecture.
+ifndef ARCH
+ARCH := $(shell uname -m)
+endif
+
 # Catch conflicting usage of ARCH in some BSD environments
 ifeq ($(ARCH), amd64)
 override ARCH=x86_64
@@ -137,6 +142,10 @@ endif
 
 endif
 
+# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
+ifeq ($(ARCH), x86_64)
+GETARCH_FLAGS += -march=native
+endif
 
 ifdef INTERFACE64
 
diff --git a/cmake/system.cmake b/cmake/system.cmake
index adedd32cc..7f3696286 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -65,6 +65,11 @@ if (DEFINED TARGET)
   set(GETARCH_FLAGS "-DFORCE_${TARGET}")
 endif ()
 
+# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
+if (X86_64) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") +endif () + if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") From 9cdc828afa3b209c2c74a7d9daa5fac85bece49f Mon Sep 17 00:00:00 2001 From: Michael Lass Date: Fri, 3 May 2019 21:22:27 +0200 Subject: [PATCH 616/935] c_check: Unlink correct file --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index d93b756d5..271182c54 100644 --- a/c_check +++ b/c_check @@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } else { $no_avx512 = 0; } - unlink("tmpf.o"); + unlink("$tmpf.o"); } } From d0c3543c3f38bbbdf363caa8c37bcf6df5bdb6fd Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Wed, 5 Jun 2019 10:30:57 +0000 Subject: [PATCH 617/935] power9 zgemm ztrmm optimized --- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/zgemm_kernel_power9.S | 2 +- kernel/power/zgemm_logic_power9.S | 2336 +++++++++++++++++++--------- kernel/power/zgemm_macros_power9.S | 1562 ++++++++++++------- param.h | 2 +- 5 files changed, 2598 insertions(+), 1306 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 440eaab1b..126313c9a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -6,7 +6,7 @@ STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index a41bcec77..813f270b8 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T8 r16 #define T5 r17 #define T2 r19 -#define T9 r20 +#define TEMP_REG r20 #define T6 r21 #define I r22 #define J r23 diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index 01685fe79..f902484a3 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -26,972 +26,1866 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define MY_ALIGN .align 3 b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ -/* MINI SUBROUTINES */ - - -/* 2x8 MAIN 128x+1 LOOP */ -ZGEMM_L2x8_LMAIN_SUB: - mtctr L - LOAD2x8 0 - MY_ALIGN +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN ZGEMM_L2x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,0 - KERNEL2x8_L 128,32,16,0 - KERNEL2x8_L 128,32,17,0 - KERNEL2x8_L 128,32,18,0 - KERNEL2x8_L 128,32,19,0 - KERNEL2x8_L 128,32,20,0 - KERNEL2x8_L 128,32,21,0 - KERNEL2x8_L 128,32,22,0 - KERNEL2x8_L 128,32,23,0 - KERNEL2x8_L 128,32,24,0 - KERNEL2x8_L 128,32,25,0 - KERNEL2x8_L 128,32,26,0 - KERNEL2x8_L 128,32,27,0 - KERNEL2x8_L 128,32,28,0 - KERNEL2x8_L 128,32,29,0 - KERNEL2x8_L 128,32,30,0 - KERNEL2x8_L 128,32,31,0 - KERNEL2x8_L 128,32,32,0 - KERNEL2x8_L 128,32,33,0 - KERNEL2x8_L 128,32,34,0 - KERNEL2x8_L 128,32,35,0 - KERNEL2x8_L 128,32,36,0 - KERNEL2x8_L 128,32,37,0 - KERNEL2x8_L 128,32,38,0 - KERNEL2x8_L 128,32,39,0 - KERNEL2x8_L 128,32,40,0 - KERNEL2x8_L 128,32,41,0 - KERNEL2x8_L 128,32,42,0 - KERNEL2x8_L 128,32,43,0 - KERNEL2x8_L 128,32,44,0 - KERNEL2x8_L 128,32,45,0 - KERNEL2x8_L 128,32,46,0 - KERNEL2x8_L 128,32,47,0 - KERNEL2x8_L 128,32,48,0 - KERNEL2x8_L 128,32,49,0 - KERNEL2x8_L 128,32,50,0 - KERNEL2x8_L 128,32,51,0 - KERNEL2x8_L 128,32,52,0 - KERNEL2x8_L 128,32,53,0 - KERNEL2x8_L 128,32,54,0 - KERNEL2x8_L 128,32,55,0 - KERNEL2x8_L 128,32,56,0 - KERNEL2x8_L 128,32,57,0 - KERNEL2x8_L 128,32,58,0 - KERNEL2x8_L 128,32,59,0 - KERNEL2x8_L 128,32,60,0 - KERNEL2x8_L 128,32,61,0 - KERNEL2x8_L 128,32,62,0 - KERNEL2x8_L 128,32,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + 
KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN ZGEMM_L2x8_LOOP_END: - END2x8 AO, BO, 128,32 - blr - +/*----------------------------------------*/ + END2x8_2 + blr MY_ALIGN -ZGEMM_2x8_L64_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,0 - KERNEL2x8_L 128,32,16,0 - KERNEL2x8_L 128,32,17,0 - KERNEL2x8_L 128,32,18,0 - KERNEL2x8_L 128,32,19,0 - KERNEL2x8_L 128,32,20,0 - KERNEL2x8_L 128,32,21,0 - KERNEL2x8_L 128,32,22,0 - KERNEL2x8_L 128,32,23,0 - KERNEL2x8_L 128,32,24,0 - KERNEL2x8_L 128,32,25,0 - KERNEL2x8_L 128,32,26,0 - KERNEL2x8_L 128,32,27,0 - KERNEL2x8_L 128,32,28,0 - KERNEL2x8_L 128,32,29,0 - KERNEL2x8_L 128,32,30,0 - KERNEL2x8_E 128,32,31,1 - blr +ZGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr MY_ALIGN + + ZGEMM_2x8_L32_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,1 - blr +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 
256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr MY_ALIGN + ZGEMM_2x8_L16_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,1 - blr - MY_ALIGN +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + ZGEMM_2x4_LMAIN_SUB: - mtctr L - LOAD2x4 0 - MY_ALIGN -ZGEMM_L2x4_LOOP: - KERNEL2x4_L 64,32,0,0 - KERNEL2x4_L 64,32,1,0 - KERNEL2x4_L 64,32,2,0 - KERNEL2x4_L 64,32,3,0 - KERNEL2x4_L 64,32,4,0 - KERNEL2x4_L 64,32,5,0 - KERNEL2x4_L 64,32,6,0 - KERNEL2x4_L 64,32,7,0 - KERNEL2x4_L 64,32,8,0 - KERNEL2x4_L 64,32,9,0 - KERNEL2x4_L 64,32,10,0 - KERNEL2x4_L 64,32,11,0 - KERNEL2x4_L 64,32,12,0 - KERNEL2x4_L 64,32,13,0 - KERNEL2x4_L 64,32,14,0 - KERNEL2x4_L 64,32,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN ZGEMM_L2x4_LOOP_END: - END2x4 AO, BO, 64,32 - blr - +/*----------------------------------------*/ + END2x4_2 + blr MY_ALIGN + + ZGEMM_2x4_L16_SUB: - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_L 64,32, 1,0 - KERNEL2x4_L 64,32, 2,0 - KERNEL2x4_L 64,32, 3,0 - KERNEL2x4_L 64,32, 4,0 - KERNEL2x4_L 64,32, 5,0 - KERNEL2x4_L 64,32, 6,0 - KERNEL2x4_E 64,32, 7,1 +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 blr - MY_ALIGN + + ZGEMM_2x4_L8_SUB: - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_L 64,32, 1,0 - KERNEL2x4_L 64,32, 2,0 - KERNEL2x4_E 64,32, 3,1 +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: 
+/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 blr -/* MAIN LOOP BEGINS */ - MY_ALIGN + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + ZGEMM_L2: - srawi. J, N, 1 - ble ZGEMM_L2_END +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + ZGEMM_L2_BEGIN: - mr CO, C - slwi T1, LDC , 1 +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 add T2,C,LDC - mr AO, A - add C, C, T1 - srawi. I, M, 3 - ble ZGEMM_L2x8_END + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END dcbt CO,r0 /*just prefetch*/ dcbt T2,r0 -ZGEMM_L2x8_BEGIN: - mr T1, K - mr BO, B - dcbt B, r0 - dcbt AO, r0 - /* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - - addi T1,T1, -1 - /* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. L, T1, 7 /**(K-1) % 128x */ - - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - -ZGEMM_L2x8_SUB0: - andi. 
L, K, 255 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else cmpwi K,128 - bne ZGEMM_L2x8_SUB2 - MY_ALIGN -ZGEMM_L2x8_SUB2_128: - bl ZGEMM_2x8_L64_SUB - bl ZGEMM_2x8_L64_SUB - b ZGEMM_L2x8_SAVE +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE MY_ALIGN + + ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB MY_ALIGN + + ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB MY_ALIGN + + ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ andi. T1,L, 16 ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ andi. T1,L, 8 ble ZGEMM_L2x8_SUB2_4 - LOAD2x8 0 - KERNEL2x8_L 128,32, 0,0 - KERNEL2x8_L 128,32, 1,0 - KERNEL2x8_L 128,32, 2,0 - KERNEL2x8_E 128,32, 3,1 - MY_ALIGN + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x8_SUB2_2 - LOAD2x8 0 - KERNEL2x8_L 128,32, 0,0 - KERNEL2x8_E 128,32, 1,1 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 MY_ALIGN + + ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x8_SUB2_1 - LOAD2x8 0 - KERNEL2x8_E 128,32, 0,1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 MY_ALIGN + + ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 -ZGEMM_L2x8_SAVE: - addic. I, I, -1 - SAVE2x8 - bgt ZGEMM_L2x8_BEGIN +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. 
T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN ZGEMM_L2x8_END: +/*----------------------------------------*/ -ZGEMM_L2x4_BEGIN: - - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END - mr BO, B - mr T1, K - addi T1,T1, -1 - ZERO2x4 - srawi. L, T1, 5 /**(K-1) % 32x */ - - ble ZGEMM_L2x4_SUB0 +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + ZGEMM_L2x4_SUB0: - andi. L, K, 63 +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else cmpwi K,32 - bne ZGEMM_L2x4_SUB2 - MY_ALIGN -ZGEMM_L2x4_SUB2_32: - bl ZGEMM_2x4_L16_SUB - bl ZGEMM_2x4_L16_SUB - b ZGEMM_L2x4_SAVE +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN MY_ALIGN -ZGEMM_L2x4_SUB2: + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ andi. T1,L, 16 ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB + bl ZGEMM_2x4_L16_SUB MY_ALIGN -ZGEMM_L2x4_SUB2_8: + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ andi. T1,L, 8 ble ZGEMM_L2x4_SUB2_4 bl ZGEMM_2x4_L8_SUB MY_ALIGN + + ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x4_SUB2_2 - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_E 64,32, 1,1 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 MY_ALIGN + + ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x4_SUB2_1 - LOAD2x4 0 - KERNEL2x4_E 64,32, 0,1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 MY_ALIGN + + ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif - SAVE2x4 ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + -ZGEMM_L2x2_BEGIN: - - andi. T1, M, 2 - ble ZGEMM_L2x2_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 4 /**(K-1) % 16x */ - ZERO2x2 - ble ZGEMM_L2x2_SUB0 - -ZGEMM_L2x2_LOOP_START: - LOAD2x2 0 - mtctr L - - MY_ALIGN -ZGEMM_L2x2_LOOP: - KERNEL2x2_L 32,32,0,0 - KERNEL2x2_L 32,32,1,0 - KERNEL2x2_L 32,32,2,0 - KERNEL2x2_L 32,32,3,0 - KERNEL2x2_L 32,32,4,0 - KERNEL2x2_L 32,32,5,0 - KERNEL2x2_L 32,32,6,0 - KERNEL2x2_L 32,32,7,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN -ZGEMM_L2x2_LOOP_END: - END2x2 AO, BO, 32,32 - - b ZGEMM_L2x2_SUB1 - ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 31 - - b ZGEMM_L2x2_SUB2 -ZGEMM_L2x2_SUB1: +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB + MY_ALIGN - andi. L, T1, 15 - ble ZGEMM_L2x2_SAVE -ZGEMM_L2x2_SUB2: - srawi. T1,L, 3 +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 ble ZGEMM_L2x2_SUB2_4 - mtctr T1 - MY_ALIGN -ZGEMM_L2x2_SUB2_LOOP: - LOAD2x2 0 - KERNEL2x2_L 32,32, 0,0 - KERNEL2x2_L 32,32, 1,0 - KERNEL2x2_L 32,32, 2,0 - KERNEL2x2_E 32,32, 3,1 - bdnz ZGEMM_L2x2_SUB2_LOOP + bl ZGEMM_2x2_L8_SUB MY_ALIGN + + ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x2_SUB2_2 - LOAD2x2 0 - KERNEL2x2_L 32,32, 0,0 - KERNEL2x2_E 32,32, 1,1 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 MY_ALIGN + + ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x2_SUB2_1 - LOAD2x2 0 - KERNEL2x2_E 32,32, 0,1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 MY_ALIGN + + ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif - SAVE2x2 ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 - -ZGEMM_L2x1_BEGIN: - andi. T1, M, 1 - ble ZGEMM_L2x1_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 4 /**(K-1) % 16x */ - ZERO2x1 - ble ZGEMM_L2x1_SUB0 - -ZGEMM_L2x1_LOOP_START: - - LOAD2x1 0 - mtctr L - - MY_ALIGN -ZGEMM_L2x1_LOOP: - KERNEL2x1_L 16,32,0,0 - KERNEL2x1_L 16,32,1,0 - KERNEL2x1_L 16,32,2,0 - KERNEL2x1_L 16,32,3,0 - KERNEL2x1_L 16,32,4,0 - KERNEL2x1_L 16,32,5,0 - KERNEL2x1_L 16,32,6,0 - KERNEL2x1_L 16,32,7,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: - END2x1 AO, BO, 16,32 - - b ZGEMM_L2x1_SUB1 - ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 31 - - b ZGEMM_L2x1_SUB2 -ZGEMM_L2x1_SUB1: +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB + MY_ALIGN - andi. L, T1, 15 - ble ZGEMM_L2x1_SAVE -ZGEMM_L2x1_SUB2: - srawi. T1,L, 3 +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 ble ZGEMM_L2x1_SUB2_4 - mtctr T1 - MY_ALIGN -ZGEMM_L2x1_SUB2_LOOP: - LOAD2x1 0 - KERNEL2x1_L 16,32, 0,0 - KERNEL2x1_L 16,32, 1,0 - KERNEL2x1_L 16,32, 2,0 - KERNEL2x1_E 16,32, 3,1 - bdnz ZGEMM_L2x1_SUB2_LOOP + bl ZGEMM_2x1_L8_SUB MY_ALIGN + + ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x1_SUB2_2 - LOAD2x1 0 - KERNEL2x1_L 16,32, 0,0 - KERNEL2x1_E 16,32, 1,1 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 MY_ALIGN + + ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x1_SUB2_1 - LOAD2x1 0 - KERNEL2x1_E 16,32, 0,1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 MY_ALIGN + + ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif - SAVE2x1 ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN - slwi T1, K, 5 - add B, B, T1 - addic. J, J, -1 - bgt ZGEMM_L2_BEGIN +ZGEMM_L2_END: - andi. 
T2, N, 1 - ble L999 +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ -ZGEMM_L2_END: - b ZGEMM_L1_BEGIN +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + -L999_H1: +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN - b L999 -ZGEMM_L1_BEGIN: - andi. 
T1, N, 1 - ble ZGEMM_L1_END +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN - mr CO, C - mr AO, A - srawi. I, M, 3 - ble ZGEMM_L1x8_END -ZGEMM_L1x8_BEGIN: +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + +ZGEMM_1x2_L8_SUB: 
+/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 32x */ - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - -ZGEMM_L1x8_LOOP_START: +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 - LOAD1x8 0 + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ li T2, 1024 - li T3, 1024+512 - li T4, 2048 - li T5, 2048+512 - mtctr L + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. 
L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + - MY_ALIGN -ZGEMM_L1x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L 128,16,0,0 - KERNEL1x8_L 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L 128,16,2,0 - KERNEL1x8_L 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L 128,16,4,0 - KERNEL1x8_L 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L 128,16,6,0 - KERNEL1x8_L 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L 128,16,8,0 - KERNEL1x8_L 128,16,9,0 - KERNEL1x8_L 128,16,10,0 - KERNEL1x8_L 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L 128,16,12,0 - KERNEL1x8_L 128,16,13,0 - KERNEL1x8_L 128,16,14,0 - KERNEL1x8_L 128,16,15,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: - END1x8 AO, BO, 128,16 - - b ZGEMM_L1x8_SUB1 - ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x8_SUB2 -ZGEMM_L1x8_SUB1: +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB + MY_ALIGN - andi. L, T1, 31 - ble ZGEMM_L1x8_SAVE -ZGEMM_L1x8_SUB2: - srawi. T1,L, 3 +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 ble ZGEMM_L1x8_SUB2_4 - mtctr T1 - MY_ALIGN -ZGEMM_L1x8_SUB2_LOOP: - LOAD1x8 0 - KERNEL1x8_L 128,16, 0,0 - KERNEL1x8_L 128,16, 1,0 - KERNEL1x8_L 128,16, 2,0 - KERNEL1x8_E 128,16, 3,1 - bdnz ZGEMM_L1x8_SUB2_LOOP - MY_ALIGN + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x8_SUB2_2 - LOAD1x8 0 - KERNEL1x8_L 128,16, 0,0 - KERNEL1x8_E 128,16, 1,1 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 MY_ALIGN + + ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x8_SUB2_1 - LOAD1x8 0 - KERNEL1x8_E 128,16, 0,1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 MY_ALIGN + + ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - + ble ZGEMM_L1x8_SAVE + KERNEL1x8 -ZGEMM_L1x8_SAVE: - SAVE1x8 +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN - addic. I, I, -1 - bgt ZGEMM_L1x8_BEGIN ZGEMM_L1x8_END: +/*----------------------------------------*/ + ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. 
T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + - andi. T2, M, 7 - ble ZGEMM_L1x1_END - - andi. T1, M, 4 - ble ZGEMM_L1x4_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 16x */ - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - -ZGEMM_L1x4_LOOP_START: - LOAD1x4 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x4_LOOP: - KERNEL1x4_L 64,16,0,0 - KERNEL1x4_L 64,16,1,0 - KERNEL1x4_L 64,16,2,0 - KERNEL1x4_L 64,16,3,0 - KERNEL1x4_L 64,16,4,0 - KERNEL1x4_L 64,16,5,0 - KERNEL1x4_L 64,16,6,0 - KERNEL1x4_L 64,16,7,0 - KERNEL1x4_L 64,16,8,0 - KERNEL1x4_L 64,16,9,0 - KERNEL1x4_L 64,16,10,0 - KERNEL1x4_L 64,16,11,0 - KERNEL1x4_L 64,16,12,0 - KERNEL1x4_L 64,16,13,0 - KERNEL1x4_L 64,16,14,0 - KERNEL1x4_L 64,16,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN -ZGEMM_L1x4_LOOP_END: - END1x4 AO, BO, 64,16 - - b ZGEMM_L1x4_SUB1 - ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x4_SUB2 -ZGEMM_L1x4_SUB1: +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB + MY_ALIGN - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE -ZGEMM_L1x4_SUB2: - srawi. T1,L, 3 +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 ble ZGEMM_L1x4_SUB2_4 - mtctr T1 - MY_ALIGN -ZGEMM_L1x4_SUB2_LOOP: - LOAD1x4 0 - KERNEL1x4_L 64,16, 0,0 - KERNEL1x4_L 64,16, 1,0 - KERNEL1x4_L 64,16, 2,0 - KERNEL1x4_E 64,16, 3,1 - bdnz ZGEMM_L1x4_SUB2_LOOP + bl ZGEMM_1x4_L8_SUB MY_ALIGN + + ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x4_SUB2_2 - LOAD1x4 0 - KERNEL1x4_L 64,16, 0,0 - KERNEL1x4_E 64,16, 1,1 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 MY_ALIGN + + ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x4_SUB2_1 - LOAD1x4 0 - KERNEL1x4_E 64,16, 0,1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 MY_ALIGN + + ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif - SAVE1x4 ZGEMM_L1x4_END: +/*----------------------------------------*/ + ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. 
T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 - andi. T1, M, 2 - ble ZGEMM_L1x2_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 16x */ - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - -ZGEMM_L1x2_LOOP_START: - LOAD1x2 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x2_LOOP: - KERNEL1x2_L 32,16,0,0 - KERNEL1x2_L 32,16,1,0 - KERNEL1x2_L 32,16,2,0 - KERNEL1x2_L 32,16,3,0 - KERNEL1x2_L 32,16,4,0 - KERNEL1x2_L 32,16,5,0 - KERNEL1x2_L 32,16,6,0 - KERNEL1x2_L 32,16,7,0 - KERNEL1x2_L 32,16,8,0 - KERNEL1x2_L 32,16,9,0 - KERNEL1x2_L 32,16,10,0 - KERNEL1x2_L 32,16,11,0 - KERNEL1x2_L 32,16,12,0 - KERNEL1x2_L 32,16,13,0 - KERNEL1x2_L 32,16,14,0 - KERNEL1x2_L 32,16,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN -ZGEMM_L1x2_LOOP_END: - END1x2 AO, BO, 32,16 - - b ZGEMM_L1x2_SUB1 - ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x2_SUB2 -ZGEMM_L1x2_SUB1: +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB + MY_ALIGN - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE -ZGEMM_L1x2_SUB2: - srawi. T1,L, 3 +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 ble ZGEMM_L1x2_SUB2_4 - mtctr T1 - MY_ALIGN -ZGEMM_L1x2_SUB2_LOOP: - LOAD1x2 0 - KERNEL1x2_L 32,16, 0,0 - KERNEL1x2_L 32,16, 1,0 - KERNEL1x2_L 32,16, 2,0 - KERNEL1x2_E 32,16, 3,1 - bdnz ZGEMM_L1x2_SUB2_LOOP + bl ZGEMM_1x2_L8_SUB MY_ALIGN + + ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x2_SUB2_2 - LOAD1x2 0 - KERNEL1x2_L 32,16, 0,0 - KERNEL1x2_E 32,16, 1,1 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 MY_ALIGN + + ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x2_SUB2_1 - LOAD1x2 0 - KERNEL1x2_E 32,16, 0,1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 MY_ALIGN + + ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif - SAVE1x2 ZGEMM_L1x2_END: +/*----------------------------------------*/ + ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 - andi. 
T1, M, 1
- ble ZGEMM_L1x1_END
- mr BO, B
- mr T1, K
- addi T1,T1, -1
- srawi. L, T1, 5 /**(K-1) % 16x */
- ZERO1x1
- ble ZGEMM_L1x1_SUB0
-
-ZGEMM_L1x1_LOOP_START:
-
- LOAD1x1 0
- mtctr L
-
- MY_ALIGN
-ZGEMM_L1x1_LOOP:
- KERNEL1x1_L 16,16,0,0
- KERNEL1x1_L 16,16,1,0
- KERNEL1x1_L 16,16,2,0
- KERNEL1x1_L 16,16,3,0
- KERNEL1x1_L 16,16,4,0
- KERNEL1x1_L 16,16,5,0
- KERNEL1x1_L 16,16,6,0
- KERNEL1x1_L 16,16,7,0
- KERNEL1x1_L 16,16,8,0
- KERNEL1x1_L 16,16,9,0
- KERNEL1x1_L 16,16,10,0
- KERNEL1x1_L 16,16,11,0
- KERNEL1x1_L 16,16,12,0
- KERNEL1x1_L 16,16,13,0
- KERNEL1x1_L 16,16,14,0
- KERNEL1x1_L 16,16,15,1
- bdnz ZGEMM_L1x1_LOOP
- MY_ALIGN
-ZGEMM_L1x1_LOOP_END:
- END1x1 AO, BO, 16, 16
-
- b ZGEMM_L1x1_SUB1
-
 ZGEMM_L1x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x1_32K
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD1x1O 16,16
+ END1x1_WITHOUT_ADD
+ LOAD1x1_2O 32, 32
+ mtctr T8
+ bl ZGEMM_L1x1_K32
+ b ZGEMM_L1x1_SAVE
+ CMP1x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD1x1_2O 32,32
+ bl ZGEMM_L1x1_K32
+ b ZGEMM_L1x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
- andi. L, K, 63
-
- b ZGEMM_L1x1_SUB2
-ZGEMM_L1x1_SUB1:
+ZGEMM_L1x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x1_SUB2_8
+ bl ZGEMM_1x1_L16_SUB
+ MY_ALIGN
- andi. L, T1, 31
- ble ZGEMM_L1x1_SAVE
-ZGEMM_L1x1_SUB2:
- srawi. T1,L, 3
+ZGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
 ble ZGEMM_L1x1_SUB2_4
- mtctr T1
- MY_ALIGN
-ZGEMM_L1x1_SUB2_LOOP:
- LOAD1x1 0
- KERNEL1x1_L 16,16, 0,0
- KERNEL1x1_L 16,16, 1,0
- KERNEL1x1_L 16,16, 2,0
- KERNEL1x1_E 16,16, 3,1
- bdnz ZGEMM_L1x1_SUB2_LOOP
+ bl ZGEMM_1x1_L8_SUB
 MY_ALIGN
+
+
 ZGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/
 andi. T1,L, 4
 ble ZGEMM_L1x1_SUB2_2
- LOAD1x1 0
- KERNEL1x1_L 16,16, 0,0
- KERNEL1x1_E 16,16, 1,1
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32, 0,0
+ KERNEL1x1_E2 32,32, 1,1
 MY_ALIGN
+
+
 ZGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/
 andi. T1,L, 2
 ble ZGEMM_L1x1_SUB2_1
- LOAD1x1 0
- KERNEL1x1_E 16,16, 0,1
+ LOAD1x1_2
+ KERNEL1x1_E2 32,32, 0,1
 MY_ALIGN
+
+
 ZGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/
 andi. T1,L, 1
- ble ZGEMM_L1x1_SAVE
- KERNEL1x1
+ ble ZGEMM_L1x1_SAVE
+ KERNEL1x1
+
 ZGEMM_L1x1_SAVE:
+/*----------------------------------------*/
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
+#endif
- SAVE1x1
 ZGEMM_L1x1_END:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+
 ZGEMM_L1_END:
+/*----------------------------------------*/
+
\ No newline at end of file
diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S
index 10d9e4cc3..8670e9574 100644
--- a/kernel/power/zgemm_macros_power9.S
+++ b/kernel/power/zgemm_macros_power9.S
@@ -25,7 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
 #define unit_size 16
 #define DISP32(ind,disp) (ind*unit_size*32+disp)
 #define DISP16(ind,disp) (ind*unit_size*16+disp)
@@ -34,10 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DISP2(ind,disp) (ind*unit_size*2+disp)
 #define DISP1(ind,disp) (ind*unit_size+disp)
 #define DISPX(disp) (disp)
-
 /* HELPERS FOR SAVE */
-
 /* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+
+
 .macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
 #ifndef TRMMKERNEL
 lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
@@ -46,20 +45,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
 #endif
 .endm
-
 /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
+
+
 .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
 xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
 xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
 .endm
-
 /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
+
+
 .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
 xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
 xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
 .endm
-
 /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
+
+
 .macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
 xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
@@ -78,8 +80,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
 #endif
 .endm
-
 /* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+
+
 .macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
 #ifndef TRMMKERNEL
 xvmsubadp \VSOUT1,\VSINII, alpha_i
@@ -89,23 +92,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xvmuldp \VSOUT2,\VSINRR, alpha_i
 #endif
 .endm
-
 /* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+
 .macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
 xvmsubadp \VSOUT1,\VSINRR, alpha_r
 xvmaddadp \VSOUT2,\VSINII, alpha_r
 .endm
-
 /* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
+
+
 .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
 xxmrghd \VSOUT1,\VSIN2,\VSIN1
 xxmrgld \VSOUT2,\VSIN2,\VSIN1
 .endm
+
+
 .macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
 stxv \VSIN1, DISPX(\LOFFSET)(\REG)
 stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
 .endm
+
 .macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
 RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
@@ -141,6 +149,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
 .endm
+
 .macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
 RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
@@ -161,6 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm + .macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -173,6 +183,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm + .macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 #ifndef TRMMKERNEL @@ -188,9 +199,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxmrghd vs7,vs15,vs14 stxv vs7, (\LOFFSET)(\BASE_REG) .endm - /********************************************************************************************** -* Macros for N=2 and M=8 +* + +.macros for N=2 and M=8 **********************************************************************************************/ .macro Zero2x8 @@ -228,269 +240,272 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs63, vs63, vs63 .endm -.macro LOAD2x8 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A +.macro END2x8_NORMAL + END2x8 AO,BO,128,32 +.endm -.if \Zero==1 - Zero2x8 -.endif +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 .endm -.macro END2x8_NORMAL - END2x8 AO,BO,128,32 -.endm .macro END2x8 AREG, BREG, OffsetA, OffsetB - .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs48, vs0, vs18 - + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 xvmaddadp vs34, vs1, vs16 xvmaddadp vs50, vs1, vs18 - + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 xvmaddadp vs36, vs2, vs16 xvmaddadp vs52, vs2, vs18 - + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 xvmaddadp vs38, vs3, vs16 xvmaddadp vs54, vs3, vs18 - + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 xvmaddadp vs40, vs4, vs16 xvmaddadp vs56, vs4, vs18 - + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 xvmaddadp vs42, vs5, vs16 xvmaddadp vs58, vs5, vs18 - + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 xvmaddadp vs44, vs6, vs16 xvmaddadp vs60, vs6, vs18 - + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 xvmaddadp vs46, vs7, vs16 xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.endm - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 - - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 - - xvmaddadp vs41, vs4, vs17 - 
xvmaddadp vs57, vs4, vs19 +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm -.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm .macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B xvmaddadp vs32, vs0, vs16 xvmaddadp vs48, vs0, vs18 xvmaddadp vs33, vs0, vs17 xvmaddadp vs49, vs0, vs19 - - xxswapd vs21, vs20 - xxswapd vs23, vs22 - + xxswapd vs21, vs20 + xxswapd vs23, vs22 xvmaddadp vs34, vs1, vs16 xvmaddadp vs50, vs1, vs18 - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs35, vs1, vs17 xvmaddadp vs51, vs1, vs19 - - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs52, vs2, vs18 - - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs37, vs2, vs17 xvmaddadp vs53, vs2, vs19 - - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - - xvmaddadp 
vs38, vs3, vs16 xvmaddadp vs54, vs3, vs18 - -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - xvmaddadp vs39, vs3, vs17 xvmaddadp vs55, vs3, vs19 - -.if \Complete==0 - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs4, vs16 xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 xvmaddadp vs58, vs5, vs18 xvmaddadp vs43, vs5, vs17 xvmaddadp vs59, vs5, vs19 - -.if \Complete==0 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs6, vs16 xvmaddadp vs60, vs6, vs18 xvmaddadp vs45, vs6, vs17 xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 xvmaddadp vs62, vs7, vs18 xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 - -.if \Complete==0 - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs48, vs8, vs22 .if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif - -.endif + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs33, vs8, vs21 xvmaddadp vs49, vs8, vs23 - -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 .endif - xvmaddadp vs34, vs9, vs20 xvmaddadp vs50, vs9, vs22 xvmaddadp vs35, vs9, vs21 xvmaddadp vs51, vs9, vs23 - +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs52, vs10, vs22 xvmaddadp vs37, vs10, vs21 xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 xvmaddadp vs54, vs11, vs22 xvmaddadp vs39, vs11, vs21 xvmaddadp vs55, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs12, vs20 xvmaddadp vs56, vs12, vs22 xvmaddadp vs41, vs12, vs21 xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 xvmaddadp vs58, vs13, vs22 xvmaddadp vs43, vs13, vs21 xvmaddadp vs59, vs13, vs23 - +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp 
vs44, vs14, vs20 xvmaddadp vs60, vs14, vs22 xvmaddadp vs45, vs14, vs21 xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 xvmaddadp vs62, vs15, vs22 xvmaddadp vs47, vs15, vs21 xvmaddadp vs63, vs15, vs23 - +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .endm + + + + .macro KERNEL2x8 - LOAD2x8 0 + LOAD2x8 END2x8 AO, BO, 128,32 .endm -.macro SAVE2x8 +.macro SAVE2x8 add T1, CO ,LDC SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 addi CO, CO, 128 - .endm - /********************************************************************************************** -* Macros for N=2 and M=4 +* + +.macros for N=2 and M=4 **********************************************************************************************/ + .macro Zero2x4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -510,167 +525,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs47, vs47, vs47 .endm -.macro LOAD2x4 Zero - - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm -.if \Zero==1 - Zero2x4 -.endif +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A .endm + .macro END2x4_NORMAL END2x4 AO,BO,64,32 .endm -.macro END2x4 AREG, BREG, OffsetA, OffsetB +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 xvmaddadp vs47, vs3, vs19 .endm -.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, 
(\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 xvmaddadp vs41, vs0, vs19 - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) -.endif -.endif - + xxswapd vs21, vs20 + xxswapd vs23, vs22 xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 xvmaddadp vs43, vs1, vs19 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A .endif + xvmaddadp vs36, vs2, vs16 xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 xvmaddadp vs47, vs3, vs19 - - .if \Complete==0 - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - -.if \Complete==0 - xxswapd vs17, vs16 
- xxswapd vs19, vs18 -.endif - - xvmaddadp vs40, vs8, vs22 xvmaddadp vs41, vs8, vs23 - +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 xvmaddadp vs43, vs9, vs23 - +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 xvmaddadp vs45, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 xvmaddadp vs47, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .endm + + .macro KERNEL2x4 - LOAD2x4 0 + LOAD2x4 END2x4 AO, BO, 64,32 .endm + + .macro SAVE2x4 add T1, CO ,LDC SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 addi CO, CO, 64 - .endm - /********************************************************************************************** -* Macros for N=2 and M=2 +* + +.macros for N=2 and M=2 **********************************************************************************************/ + .macro Zero2x2 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -680,231 +727,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs37, vs37, vs37 xxlxor vs38, vs38, vs38 xxlxor vs39, vs39, vs39 -.endm -.macro LOAD2x2 Zero +.endm - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm -.if \Zero==1 - Zero2x2 -.endif +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + .endm + .macro END2x2_NORMAL END2x2 AO,BO,32,32 .endm -.macro END2x2 AREG, BREG, OffsetA, OffsetB +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs38, vs1, vs18 - xvmaddadp vs39, vs1, vs19 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 .endm -.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 .endm + + -.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xxswapd vs23, vs22 +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 xvmaddadp vs37, vs0, vs19 - + xxswapd vs21, vs20 + xxswapd vs23, vs22 xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 xvmaddadp vs39, vs1, vs19 - -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A -.endif -.if 
\Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif - xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 xvmaddadp vs33, vs8, vs21 - -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs36, vs8, vs22 xvmaddadp vs37, vs8, vs23 - +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .endm + + .macro KERNEL2x2 - LOAD2x2 0 + LOAD2x2 END2x2 AO, BO, 32,32 .endm + + .macro SAVE2x2 add T1, CO ,LDC SAVE2 vs32,vs33,vs34,vs35,CO,0 SAVE2 vs36,vs37,vs38,vs39,T1,0 addi CO, CO, 32 .endm - /********************************************************************************************** -* Macros for N=2 and M=1 +* + +.macros for N=2 and M=1 **********************************************************************************************/ + + .macro Zero2x1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 + .endm -.macro LOAD2x1 Zero - lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.if \Zero==1 - Zero2x1 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A .endm + .macro END2x1_NORMAL END2x1 AO,BO,16,32 .endm -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif - +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs35, vs0, vs19 -.endm +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm -.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm -.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv 
vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 .endm + -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - xxswapd vs21, vs20 - xxswapd vs23, vs22 -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 xvmaddadp vs35, vs0, vs19 - -.if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif - +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 xvmaddadp vs33, vs8, vs21 - - xvmaddadp vs34, vs8, vs22 xvmaddadp vs35, vs8, vs23 - +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .endm + + .macro KERNEL2x1 - LOAD2x1 0 + LOAD2x1 END2x1 AO, BO, 16,32 .endm + + .macro SAVE2x1 add T1, CO ,LDC SAVE1 vs32,vs33,CO,0 @@ -913,8 +1028,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm /********************************************************************************************** -* Macros for N=1 and M=8 +* + +.macros for N=1 and M=8 **********************************************************************************************/ + + .macro Zero1x8 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -932,167 +1051,228 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs45, vs45, vs45 xxlxor vs46, vs46, vs46 xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 .endm -.macro LOAD1x8 Zero - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A - -.if \Zero==1 - Zero1x8 -.endif +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + .endm + .macro END1x8_NORMAL END1x8 AO,BO,128,16 .endm -.macro END1x8 AREG, BREG, OffsetA, OffsetB +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 .endm -.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 .endm + + -.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP2(\Index, 
0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21, vs20 +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A .endif xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif + xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif + xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 - - +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A -.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A .endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 
xvmaddadp vs39, vs11, vs21 - +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs12, vs20 xvmaddadp vs41, vs12, vs21 xvmaddadp vs42, vs13, vs20 xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs14, vs20 xvmaddadp vs45, vs14, vs21 xvmaddadp vs46, vs15, vs20 xvmaddadp vs47, vs15, vs21 - +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + + + .macro KERNEL1x8 - LOAD1x8 0 + LOAD1x8 END1x8 AO, BO, 128,16 .endm -.macro SAVE1x8 - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 +.macro SAVE1x8 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 addi CO, CO, 128 - .endm - /********************************************************************************************** -* Macros for N=1 and M=4 +* + +.macros for N=2 and M=4 **********************************************************************************************/ + .macro Zero1x4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -1104,324 +1284,542 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs39, vs39, vs39 .endm -.macro LOAD1x4 Zero - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17,vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm -.if \Zero==1 - Zero1x4 -.endif +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + .endm + .macro END1x4_NORMAL END1x4 AO,BO,64,16 .endm -.macro END1x4 AREG, BREG, OffsetA, OffsetB +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 .endm -.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 .endm + + -.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21,vs20 +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs45, vs2, vs19 - xvmaddadp 
vs46, vs3, vs18 - xvmaddadp vs47, vs3, vs19 - -.if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif .if \Complete==0 - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs37, vs10, vs21 xvmaddadp vs38, vs11, vs20 xvmaddadp vs39, vs11, vs21 - - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs41, vs8, vs23 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs43, vs9, vs23 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs47, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + .macro KERNEL1x4 - LOAD1x4 0 + LOAD1x4 END1x4 AO, BO, 64,16 .endm -.macro SAVE1x4 + + +.macro SAVE1x4 SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 addi CO, CO, 64 - .endm - /********************************************************************************************** -* Macros for N=1 and M=2 +* + +.macros for N=2 and M=2 **********************************************************************************************/ + .macro Zero1x2 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs35, vs35, vs35 + .endm -.macro LOAD1x2 Zero - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17,vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - -.if \Zero==1 - Zero1x2 -.endif +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A .endm + .macro END1x2_NORMAL END1x2 AO,BO,32,16 .endm -.macro END1x2 AREG, BREG, OffsetA, OffsetB +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp 
vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 .endm -.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 .endm + + -.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21,vs20 +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) .else - addi \AREG, \AREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP4(\Index,64) addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif - - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 - +.endif .endm + + .macro KERNEL1x2 - LOAD1x2 0 + LOAD1x2 END1x2 AO, BO, 32,16 .endm -.macro SAVE1x2 + + +.macro SAVE1x2 SAVE2 vs32,vs33,vs34,vs35,CO,0 addi CO, CO, 32 .endm - /********************************************************************************************** -* Macros for N=1 and M=1 +* + +.macros for N=2 and M=1 **********************************************************************************************/ + + .macro Zero1x1 xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 + xxlxor vs33, vs33, vs33 .endm -.macro LOAD1x1 Zero - lxv vs0, 0(AO) // load real,imag from 
A - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17, vs16 -.if \Zero==1 - Zero1x1 -.endif - +.macro LOAD1x1 + LOAD1x1O 0,0 .endm + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + .macro END1x1_NORMAL END1x1 AO,BO,16,16 .endm -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.endm - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm -.endm -.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 .endm + -.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21, vs20 - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm -.if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17, vs16 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif - .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) .else - addi \AREG, \AREG, DISP2(\Index,32) + addi \AREG, \AREG, DISP2(\Index,32) addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif - - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 - - +.endif .endm + + .macro KERNEL1x1 - LOAD1x1 0 + LOAD1x1 END1x1 AO, BO, 16,16 - .endm -.macro SAVE1x1 + + +.macro SAVE1x1 SAVE1 vs32,vs33,CO,0 addi CO, CO, 16 .endm +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 
8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/param.h b/param.h index 8f78a6a64..9a1a68ecd 100644 --- a/param.h +++ b/param.h @@ -2256,7 +2256,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_Q 1025 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 1025 +#define ZGEMM_DEFAULT_Q 1026 #define SYMV_P 8 From 148c4cc5fd4db4d10dcda94c5640de12611b7669 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Wed, 5 Jun 2019 20:50:50 +0000 Subject: [PATCH 618/935] conflict resolve --- kernel/power/KERNEL.POWER9 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 126313c9a..0f91d6d7d 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o From 900d5a3205bd06c04990ff842449ec80808d5027 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jun 2019 10:18:40 +0200 Subject: [PATCH 619/935] Add gfortran workaround for ABI violations in LAPACKE for #2154 (see gcc bug 90329) --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index a95d6190f..49c02bbcb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -744,6 +744,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive +# work around ABI problem with passing single-character arguments +FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran From a0caa762b3066f28b1b9334932b39a3d377f79f9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jun 2019 10:24:16 +0200 Subject: [PATCH 620/935] Add gfortran workaround for ABI violations for #2154 (see gcc bug 90329) --- Makefile.power | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.power b/Makefile.power index 195f1930f..24d8aa8a7 100644 --- a/Makefile.power +++ b/Makefile.power @@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas endif endif +# workaround for C->FORTRAN ABI violation in LAPACKE +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -fno-optimize-sibling-calls +endif FLAMEPATH = $(HOME)/flame/lib From 6ca898b63b81325559cbd2e925bf245f2a8ac999 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jun 2019 23:17:03 +0200 Subject: [PATCH 621/935] Add gfortran workaround for potential ABI violation for #2154 --- cmake/fc.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index adec28a91..9d8a5713c 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -44,7 +44,10 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") + # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") + # work around ABI violation in passing string arguments from C + set(FCOMMON_OPT "$(FCOMMON_OPT) -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") From e674e1c73515fab38e263d121429a1a5da494a45 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jun 2019 09:31:13 +0200 Subject: [PATCH 622/935] Update fc.cmake --- cmake/fc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 
9d8a5713c..f54c989d4 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -47,7 +47,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") # work around ABI violation in passing string arguments from C - set(FCOMMON_OPT "$(FCOMMON_OPT) -fno-optimize-sibling-calls") + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") From 1f4b6a5d5d2c1e3aaf7ca1da6720825cd075391f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Jun 2019 09:50:13 +0200 Subject: [PATCH 623/935] Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds from #2143, -march=native precludes use of more specific options like -march=skylake-avx512 in individual kernels, and defeats the purpose of dynamic arch anyway. --- cmake/arch.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 470ea2a8f..b4547b7c9 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -73,7 +73,8 @@ if (DYNAMIC_ARCH) endif () if (NOT NO_AVX512) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) - endif () + string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) + endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () From 4ea794a52253ee56573922a15a64606ec82248a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Jun 2019 17:24:15 +0200 Subject: [PATCH 624/935] Avoid unintentional activation of TLS code via USE_TLS=0 fixes #2149 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 44eacda4b..c24647d62 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1070,7 +1070,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif -ifdef USE_TLS +ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif From d9ff2cd90df9a114701dcd6298ae8439d6648e04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jun 2019 23:01:35 +0200 Subject: [PATCH 625/935] Do not force gcc options on non-gcc compilers fixes compile failure with pgi 18.10 as reported on OpenBLAS-users --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1b7fe3ef4..d23645058 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -28,11 +28,15 @@ endif ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mavx2 +endif +ifeq $(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mavx2 endif endif endif +endif From 6d3efb2b5829d78926f818496de5572dbd34e64f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Jun 2019 08:08:11 +0200 Subject: [PATCH 626/935] Update Makefile.x86_64 --- Makefile.x86_64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index d23645058..99364752f 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -31,7 +31,7 @@ ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mavx2 endif -ifeq $(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mavx2 endif endif From bbd4bb0154b6c4bfc561dce07b71eba7c7fa9013 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Jun 2019 15:04:10 +0200 Subject: [PATCH 627/935] Zero ecx with a mov instruction PGI assembler does not like the initialization in the constraints. 
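For context, a minimal standalone sketch of what this cpuid wrapper looks like after the change, assuming an x86-64 target and a compiler with GCC-style extended asm. Only the asm statement mirrors the patched code; the main() harness and the vendor-string decoding are illustrative additions, not part of the patch. The sketch uses the immediate-first AT&T operand order ("mov $0, %%ecx") from the follow-up "Fix mov syntax" commit below; the first version of this patch had the operands reversed. Zeroing ECX matters because cpuid treats it as a subleaf selector on some leaves, and doing it with an explicit mov inside the template keeps the initialization out of the constraint list that the PGI assembler rejects.

    #include <stdio.h>
    #include <string.h>

    /* Wrapper as changed by this patch series: ECX is cleared by an
       explicit mov in the asm template instead of the former "c"(0)
       input constraint that PGI could not handle. */
    static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx)
    {
      __asm__ __volatile__("mov $0, %%ecx;"
                           "cpuid"
                           : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                           : "0" (op));
    }

    int main(void)
    {
      int a, b, c, d;
      char vendor[13];
      cpuid(0, &a, &b, &c, &d); /* leaf 0: max leaf in EAX, vendor in EBX,EDX,ECX */
      memcpy(vendor + 0, &b, 4);
      memcpy(vendor + 4, &d, 4);
      memcpy(vendor + 8, &c, 4);
      vendor[12] = '\0';
      printf("max basic leaf: %d, vendor: %s\n", a, vendor);
      return 0;
    }
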
--- common_x86_64.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index f59ff6627..9db66b545 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else - __asm__ __volatile__("cpuid" + __asm__ __volatile__("mov %%ecx, 0;" + "cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op), "c"(0)); + : "0" (op)); #endif } From 280552b988e4377d95bc2f77bc07d2c00bb544e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Jun 2019 18:35:43 +0200 Subject: [PATCH 628/935] Fix mov syntax --- common_x86_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_x86_64.h b/common_x86_64.h index 9db66b545..c05998d58 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -129,7 +129,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else - __asm__ __volatile__("mov %%ecx, 0;" + __asm__ __volatile__("mov $0, %%ecx;" "cpuid" : "=a" (*eax), "=b" (*ebx), From cdbfb891da2a8de14aa1d9bd7a57265284f7432c Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Mon, 17 Jun 2019 15:33:38 +0000 Subject: [PATCH 629/935] new sgemm 8x16 --- kernel/power/sgemm_logic_power9.S | 193 ++++++++-------- kernel/power/sgemm_macros_power9.S | 338 +++++++++++++++-------------- param.h | 2 +- 3 files changed, 285 insertions(+), 248 deletions(-) diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 25e8c8387..053836cbf 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -3,89 +3,89 @@ b L8 MY_ALIGN LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_0 - mtctr L + LOAD8x16_2 MY_ALIGN LSGEMM_L8x16_LOOP: - - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_2 64,32, 15,0 - KERNEL8x16_I1_L4_2 64,32, 16,0 - KERNEL8x16_I1_L4_2 64,32, 17,0 - KERNEL8x16_I1_L4_2 64,32, 18,0 - KERNEL8x16_I1_L4_2 64,32, 19,0 - KERNEL8x16_I1_L4_2 64,32, 20,0 - KERNEL8x16_I1_L4_2 64,32, 21,0 - KERNEL8x16_I1_L4_2 64,32, 22,0 - KERNEL8x16_I1_L4_2 64,32, 23,0 - KERNEL8x16_I1_L4_2 64,32, 24,0 - KERNEL8x16_I1_L4_2 64,32, 25,0 - KERNEL8x16_I1_L4_2 64,32, 26,0 - KERNEL8x16_I1_L4_2 64,32, 27,0 - KERNEL8x16_I1_L4_2 64,32, 28,0 - KERNEL8x16_I1_L4_2 64,32, 29,0 - KERNEL8x16_I1_L4_2 64,32, 30,0 - KERNEL8x16_I1_L4_2 64,32, 31,1 + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + 
KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: - END8x16 0, AO, BO, 64, 32 + END8x16_2 blr MY_ALIGN LSGEMM_L8x16_L64_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_3 64,32, 15,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 blr LSGEMM_L8x16_L32_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_3 64,32, 7,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 blr LSGEMM_L8x16_L16_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_3 64,32, 3,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 blr L8: @@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN: #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 7 /**(T11-1) % 128x */ + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ #else mr T12, K - addi T12,T12, -1 - srawi. L, T12, 7 /**(K-1) % 128x */ + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ #endif - ZERO8x16 + ZERO8x16 + mtctr L ble LSGEMM_L8x16_SUB0 bl LSGEMM_L8x16_LMAIN_SUB andi. L, T12, 127 @@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0: cmpwi T11,128 #else andi. 
L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else cmpwi K,128 -#endif - - bne LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB2_128: - bl LSGEMM_L8x16_L64_SUB - bl LSGEMM_L8x16_L64_SUB - b LSGEMM_L8x16_SAVE +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: andi. T10,L,64 @@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16: LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_3 64,32, 1,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 MY_ALIGN LSGEMM_L8x16_SUB2_4: andi. T10,L, 4 ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_0 - KERNEL8x16_I1_L4_3 64,32, 0,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_2: andi. T10,L, 2 ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_0 - KERNEL8x16_I1_L2_3 64,32, 0,1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 3f86a1d25..2c9e537c7 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ -.macro LOAD8x16_1 - LOAD8x16 1 -.endm - -.macro LOAD8x16_0 - LOAD8x16 0 -.endm + .macro KERNEL8x16_L1_L4 Index,IsLast KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 @@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm -.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - + .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm @@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs63, vs63, vs63 .endm -.macro LOAD8x16 Zero +.macro LOAD8x16 OffsetA,OffsetB - lxv vs24, 0(BO) - lxv vs28, 16(BO) + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask - lxv vs0, 0(AO) - lxv vs1, 16(AO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 - lxv vs2, 32(AO) - lxv vs3, 48(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif .endm .macro END8x16_NORMAL END8x16 0, AO, BO, 64,32 .endm +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + .macro END8x16 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 @@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete .endm .macro KERNEL8x16 First - LOAD8x16 0 + LOAD8x16 0,0 END8x16 \First, AO, BO, 64,32 .endm -.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs52, vs0,vs29 +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + 
xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs60, vs0,vs31 +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs37, vs1,vs25 +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs61, vs1,vs31 + +.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 .if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) .endif - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs62, vs2,vs31 - - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, 
vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 .endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs36, vs4,vs9 .if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) .endif + + .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) .else - addi \AREG, \AREG, DISP32(\Index,128) addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) .endif .endif - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs52, vs4,vs13 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs60, vs4,vs15 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs61, vs5,vs15 - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs51, vs7,vs12 - xvmaddasp vs55, vs7,vs13 - xvmaddasp vs59, vs7,vs14 - xvmaddasp vs63, vs7,vs15 - .endm diff --git a/param.h b/param.h index 9a1a68ecd..3934da6c8 100644 --- a/param.h +++ b/param.h @@ -2253,7 +2253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1025 +#define SGEMM_DEFAULT_Q 1026 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 1026 From a575f1e4c771b31ba29bd11af4a3190f240cf1d2 Mon Sep 17 00:00:00 2001 From: kavanabhat Date: Wed, 19 Jun 2019 15:27:14 +0530 Subject: [PATCH 630/935] Update dtrmm_kernel_16x4_power8.S --- kernel/power/dtrmm_kernel_16x4_power8.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 47e703a3a..57829ac51 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
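Stepping back from the raw hunks of the sgemm rewrite above (patch 629): the 8x16 kernel moves from a one-deep to a two-deep software pipeline. LOAD8x16_2 preloads the A and B vectors for two k-iterations, each KERNEL8x16_2 issues the FMAs for one preloaded iteration while fetching the next (the lxv reloads are skipped when Complete==1), and END8x16_2 drains the pipeline. That is why the driver now peels K-2 instead of K-1 before the 128-deep main loop, the per-call offsets double from 64,32 to 128,64, and SGEMM_DEFAULT_Q moves to the even value 1026. A conceptual sketch of the same pattern in plain C, with a scalar dot product standing in for the VSX 8x16 block (names are illustrative, not the kernel's):

#include <stdio.h>
#include <stddef.h>

static float sdot_pipelined(const float *a, const float *b, size_t k)
{
  float acc = 0.0f;
  size_t i;

  if (k < 2) {                        /* tiny-K fallback, no pipeline */
    for (i = 0; i < k; i++) acc += a[i] * b[i];
    return acc;
  }

  /* prologue: preload two iterations (the LOAD8x16_2 role) */
  float a0 = a[0], b0 = b[0];
  float a1 = a[1], b1 = b[1];

  /* steady state over k-2 elements: compute on preloaded data while
     fetching the next pair (the KERNEL8x16_2 Complete==0 role) */
  for (i = 2; i + 1 < k; i += 2) {
    acc += a0 * b0;  a0 = a[i];      b0 = b[i];
    acc += a1 * b1;  a1 = a[i + 1];  b1 = b[i + 1];
  }

  /* epilogue: drain without reloading (the END8x16_2 / Complete==1 role) */
  acc += a0 * b0;
  acc += a1 * b1;

  if (i < k)                          /* odd leftover element */
    acc += a[i] * b[i];
  return acc;
}

int main(void)
{
  const float x[5] = {1, 2, 3, 4, 5}, y[5] = {5, 4, 3, 2, 1};
  printf("%g\n", sdot_pipelined(x, y, 5));   /* prints 35 */
  return 0;
}

The payoff on POWER9 is presumably that the loads for iteration i+1 overlap the FMA latency of iteration i; the even Q keeps the default K blocking a multiple of the two-iteration pipeline depth.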
stvx v31, r11, r0 li r11,0 - stw r31, 144(SP) - stfd f1, ALPHA_SP stw r0, FZERO From 7684c4f8f8f979ec4d8a563e9b9cb442d9b04a80 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 20 Jun 2019 19:56:01 +0200 Subject: [PATCH 631/935] PGI compiler does not like -march=native --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 44eacda4b..fcb3cbe33 100644 --- a/Makefile.system +++ b/Makefile.system @@ -144,9 +144,10 @@ endif # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. ifeq ($(ARCH), x86_64) +ifneq ($(C_COMPILER), PGI) GETARCH_FLAGS += -march=native endif - +endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) From eebfeba7680e4b81f0803f44999c86303aa5945b Mon Sep 17 00:00:00 2001 From: Piotr Kubaj Date: Tue, 25 Jun 2019 10:58:56 +0200 Subject: [PATCH 632/935] Fix build on FreeBSD/powerpc64. Signed-off-by: Piotr Kubaj --- common_power.h | 6 +++--- kernel/power/axpy.S | 2 +- kernel/power/axpy_ppc440.S | 2 +- kernel/power/cgemm_kernel_8x4_power8.S | 6 +++--- kernel/power/ctrmm_kernel_8x4_power8.S | 6 +++--- kernel/power/dgemm_kernel_16x4_power8.S | 4 ++-- kernel/power/dtrmm_kernel_16x4_power8.S | 4 ++-- kernel/power/dtrsm_kernel_LT_16x4_power8.S | 4 ++-- kernel/power/gemm_beta.S | 2 +- kernel/power/gemm_kernel.S | 6 +++--- kernel/power/gemm_kernel_altivec.S | 2 +- kernel/power/gemm_kernel_altivec_cell.S | 2 +- kernel/power/gemm_kernel_altivec_g4.S | 2 +- kernel/power/gemm_kernel_cell.S | 6 +++--- kernel/power/gemm_kernel_g4.S | 4 ++-- kernel/power/gemm_kernel_hummer.S | 2 +- kernel/power/gemm_kernel_power3.S | 4 ++-- kernel/power/gemm_kernel_power6.S | 4 ++-- kernel/power/gemm_kernel_ppc440.S | 4 ++-- kernel/power/gemv_n.S | 4 ++-- kernel/power/gemv_n_ppc440.S | 4 ++-- kernel/power/gemv_t.S | 4 ++-- kernel/power/gemv_t_ppc440.S | 4 ++-- kernel/power/ger.S | 4 ++-- kernel/power/scal.S | 2 +- kernel/power/scal_ppc440.S | 2 +- kernel/power/sgemm_kernel_16x8_power8.S | 4 ++-- kernel/power/strmm_kernel_16x8_power8.S | 4 ++-- kernel/power/swap.S | 2 +- kernel/power/symv_L.S | 4 ++-- kernel/power/symv_U.S | 4 ++-- kernel/power/trsm_kernel_LN.S | 6 +++--- kernel/power/trsm_kernel_LT.S | 6 +++--- kernel/power/trsm_kernel_RT.S | 6 +++--- kernel/power/trsm_kernel_cell_LN.S | 6 +++--- kernel/power/trsm_kernel_cell_LT.S | 6 +++--- kernel/power/trsm_kernel_cell_RT.S | 6 +++--- kernel/power/trsm_kernel_hummer_LN.S | 2 +- kernel/power/trsm_kernel_hummer_LT.S | 2 +- kernel/power/trsm_kernel_hummer_RT.S | 2 +- kernel/power/trsm_kernel_power6_LN.S | 4 ++-- kernel/power/trsm_kernel_power6_LT.S | 4 ++-- kernel/power/trsm_kernel_power6_RT.S | 4 ++-- kernel/power/trsm_kernel_ppc440_LN.S | 4 ++-- kernel/power/trsm_kernel_ppc440_LT.S | 4 ++-- kernel/power/trsm_kernel_ppc440_RT.S | 4 ++-- kernel/power/zaxpy.S | 4 ++-- kernel/power/zaxpy_ppc440.S | 4 ++-- kernel/power/zgemm_beta.S | 2 +- kernel/power/zgemm_kernel.S | 8 ++++---- kernel/power/zgemm_kernel_8x2_power8.S | 6 +++--- kernel/power/zgemm_kernel_altivec.S | 6 +++--- kernel/power/zgemm_kernel_altivec_cell.S | 6 +++--- kernel/power/zgemm_kernel_altivec_g4.S | 4 ++-- kernel/power/zgemm_kernel_cell.S | 8 ++++---- kernel/power/zgemm_kernel_g4.S | 6 +++--- kernel/power/zgemm_kernel_hummer.S | 2 +- kernel/power/zgemm_kernel_power3.S | 6 +++--- kernel/power/zgemm_kernel_power6.S | 6 +++--- kernel/power/zgemm_kernel_power9.S | 4 ++-- kernel/power/zgemm_kernel_ppc440.S | 6 +++--- kernel/power/zgemv_n.S | 4 ++-- kernel/power/zgemv_n_ppc440.S | 4 ++-- 
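One note on the Makefile.system hunk above (patch 631): -march=native is a GCC/Clang extension that PGI rejects, so the guard keys on compiler identity. An alternative, sketched below in GNU make, is to probe at build time whether the active compiler accepts the flag at all, which would also cover other non-GNU compilers; MARCH_NATIVE is an invented variable, not an OpenBLAS one:

# Hedged sketch: preprocess an empty translation unit with -march=native
# and keep the flag only if the compiler accepts it.
MARCH_NATIVE := $(shell $(CC) -march=native -E -x c /dev/null >/dev/null 2>&1 && echo -march=native)

ifeq ($(ARCH), x86_64)
GETARCH_FLAGS += $(MARCH_NATIVE)
endif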
kernel/power/zgemv_t.S | 4 ++-- kernel/power/zgemv_t_ppc440.S | 4 ++-- kernel/power/zger.S | 4 ++-- kernel/power/zscal.S | 2 +- kernel/power/zscal_ppc440.S | 2 +- kernel/power/zswap.S | 4 ++-- kernel/power/zsymv_L.S | 4 ++-- kernel/power/zsymv_U.S | 4 ++-- kernel/power/ztrmm_kernel_8x2_power8.S | 6 +++--- kernel/power/ztrsm_kernel_LN.S | 8 ++++---- kernel/power/ztrsm_kernel_LT.S | 8 ++++---- kernel/power/ztrsm_kernel_RT.S | 8 ++++---- kernel/power/ztrsm_kernel_cell_LN.S | 6 +++--- kernel/power/ztrsm_kernel_cell_LT.S | 8 ++++---- kernel/power/ztrsm_kernel_cell_RT.S | 6 +++--- kernel/power/ztrsm_kernel_hummer_LN.S | 2 +- kernel/power/ztrsm_kernel_hummer_LT.S | 2 +- kernel/power/ztrsm_kernel_hummer_RT.S | 2 +- kernel/power/ztrsm_kernel_power6_LN.S | 6 +++--- kernel/power/ztrsm_kernel_power6_LT.S | 6 +++--- kernel/power/ztrsm_kernel_power6_RT.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_LN.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_LT.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_RT.S | 6 +++--- 87 files changed, 193 insertions(+), 193 deletions(-) diff --git a/common_power.h b/common_power.h index 889205c75..f38b85864 100644 --- a/common_power.h +++ b/common_power.h @@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #ifndef __64BIT__ #define PROLOGUE \ .section .text;\ @@ -784,7 +784,7 @@ Lmcount$lazy_ptr: #define HALT mfspr r0, 1023 -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #if defined(PPC440) || defined(PPC440FP2) #undef MAX_CPU_NUMBER #define MAX_CPU_NUMBER 1 @@ -829,7 +829,7 @@ Lmcount$lazy_ptr: #define MAP_ANONYMOUS MAP_ANON #endif -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #ifndef __64BIT__ #define FRAMESLOT(X) (((X) * 4) + 8) #else diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S index fb9789da4..238771826 100644 --- a/kernel/power/axpy.S +++ b/kernel/power/axpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S index 81a660e4d..7733e46e7 100644 --- a/kernel/power/axpy_ppc440.S +++ b/kernel/power/axpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 8dbb6011d..2bc99974f 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
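The long FreeBSD patch above (632) is mechanical, and it works because FreeBSD on powerpc64 follows the same SysV ELF calling convention as Linux: the register assignments for arguments and the FRAMESLOT stack offsets for spilled parameters are identical, so every "#ifdef linux" guarding them can simply widen to "#if defined(linux) || defined(__FreeBSD__)". A hedged sketch of how the repeated compound test could be centralized (OS_SYSV_PPC_ABI is an invented name, not OpenBLAS code):

#include <stdio.h>

/* Define once, test everywhere: the kernels would then say
 * "#ifdef OS_SYSV_PPC_ABI" instead of repeating the OS list in 87 files. */
#if defined(linux) || defined(__FreeBSD__)
#define OS_SYSV_PPC_ABI 1
#endif

int main(void)
{
#ifdef OS_SYSV_PPC_ABI
  printf("SysV ELF argument-passing conventions assumed\n");
#else
  printf("non-SysV target: separate register/stack layout needed\n");
#endif
  return 0;
}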
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index 26f49c663..822420dfd 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 41958eab0..651fd53fc 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ li r11,0 slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 57829ac51..84c65f503 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -269,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S index 7a4a30390..8a423f181 100644 --- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -217,7 +217,7 @@ li r11,0 #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 7acc05b4d..81457b698 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 16(SP) stw r0, 24(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S index e5e9ec346..37ff9c9e7 100644 --- a/kernel/power/gemm_kernel.S +++ b/kernel/power/gemm_kernel.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -186,7 +186,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -228,7 +228,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S index 6c7e78319..2dae49cb8 100644 --- a/kernel/power/gemm_kernel_altivec.S +++ b/kernel/power/gemm_kernel_altivec.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S index b7445a1f6..0823420dd 100644 --- a/kernel/power/gemm_kernel_altivec_cell.S +++ b/kernel/power/gemm_kernel_altivec_cell.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S index 548150143..3a214b248 100644 --- a/kernel/power/gemm_kernel_altivec_g4.S +++ b/kernel/power/gemm_kernel_altivec_g4.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S index f3d3b8325..26f9cb023 100644 --- a/kernel/power/gemm_kernel_cell.S +++ b/kernel/power/gemm_kernel_cell.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -192,7 +192,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -226,7 +226,7 @@ li PREC, 4 * SIZE #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef 
__64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S index 259f04c4e..a5c4d3a43 100644 --- a/kernel/power/gemm_kernel_g4.S +++ b/kernel/power/gemm_kernel_g4.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S index 3a8e1edfa..6ecbeb3e0 100644 --- a/kernel/power/gemm_kernel_hummer.S +++ b/kernel/power/gemm_kernel_hummer.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S index 4a6b5da62..f88bc291c 100644 --- a/kernel/power/gemm_kernel_power3.S +++ b/kernel/power/gemm_kernel_power3.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -187,7 +187,7 @@ li PREC, 4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index 1a412c4fb..b274f7655 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -183,7 +183,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S index b128beb38..c5ef6e4e5 100644 --- a/kernel/power/gemm_kernel_ppc440.S +++ b/kernel/power/gemm_kernel_ppc440.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -183,7 +183,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 02160bd61..abc61b62e 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -252,7 +252,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S index beb21200a..18d804520 100644 --- a/kernel/power/gemv_n_ppc440.S +++ b/kernel/power/gemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -199,7 +199,7 @@ stw r23, 180(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) 
#ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 457753065..25a4dd01b 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -260,7 +260,7 @@ stw r29, 220(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S index 6e560db6c..7d12b07a4 100644 --- a/kernel/power/gemv_t_ppc440.S +++ b/kernel/power/gemv_t_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -190,7 +190,7 @@ stw r22, 192(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/ger.S b/kernel/power/ger.S index fd397ce8c..d83546b0d 100644 --- a/kernel/power/ger.S +++ b/kernel/power/ger.S @@ -47,7 +47,7 @@ #endif #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -224,7 +224,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/scal.S b/kernel/power/scal.S index 7c65d1234..19fdd32ab 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S @@ -43,7 +43,7 @@ #define XX r4 #define PREA r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S index ed148834d..d977b0b59 100644 --- a/kernel/power/scal_ppc440.S +++ b/kernel/power/scal_ppc440.S @@ -43,7 +43,7 @@ #define XX r4 #define PRE r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index c72b00cf6..3e6440af8 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slwi LDC, LDC, 2 #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index f9b8a0bb8..78e539231 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/swap.S b/kernel/power/swap.S index e862b17bb..c9c0f86b0 100644 --- a/kernel/power/swap.S +++ b/kernel/power/swap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index f7d768c50..a4ff703e2 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -248,7 +248,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index d8e082397..c3063e077 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define IS r4 @@ -247,7 +247,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S index 7983c573b..8319d5ed8 100644 --- a/kernel/power/trsm_kernel_LN.S +++ b/kernel/power/trsm_kernel_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -236,7 +236,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S index c561fd014..30f25e015 100644 --- a/kernel/power/trsm_kernel_LT.S +++ b/kernel/power/trsm_kernel_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S index 07b88402c..d39d3a6e2 100644 --- a/kernel/power/trsm_kernel_RT.S +++ b/kernel/power/trsm_kernel_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -254,7 +254,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LN.S 
b/kernel/power/trsm_kernel_cell_LN.S index 803530cbb..f656015a8 100644 --- a/kernel/power/trsm_kernel_cell_LN.S +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S index 105e7d43c..083af7289 100644 --- a/kernel/power/trsm_kernel_cell_LT.S +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S index a54a261cb..5a5b67e77 100644 --- a/kernel/power/trsm_kernel_cell_RT.S +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S index 109dacb8c..35ffab427 100644 --- a/kernel/power/trsm_kernel_hummer_LN.S +++ b/kernel/power/trsm_kernel_hummer_LN.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S index 1ad062a7c..f7a09dbd8 100644 --- a/kernel/power/trsm_kernel_hummer_LT.S +++ b/kernel/power/trsm_kernel_hummer_LT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S index 94b3c0c85..0e563e5cc 100644 --- a/kernel/power/trsm_kernel_hummer_RT.S +++ b/kernel/power/trsm_kernel_hummer_RT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S index 937a6761a..83594c772 100644 --- a/kernel/power/trsm_kernel_power6_LN.S +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT 
-#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S index 924f00ec0..54a8547b0 100644 --- a/kernel/power/trsm_kernel_power6_LT.S +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S index 40ee5e28d..b2b27613c 100644 --- a/kernel/power/trsm_kernel_power6_RT.S +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S index 6b7312101..a708a084d 100644 --- a/kernel/power/trsm_kernel_ppc440_LN.S +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S index 28b109b96..31f82de2c 100644 --- a/kernel/power/trsm_kernel_ppc440_LT.S +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -176,7 +176,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S index df80cd393..f5005403c 100644 --- a/kernel/power/trsm_kernel_ppc440_RT.S +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S index ac5b249bb..b001f42d1 100644 --- a/kernel/power/zaxpy.S +++ b/kernel/power/zaxpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -123,7 +123,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S index 
b5c604e91..848a0135f 100644 --- a/kernel/power/zaxpy_ppc440.S +++ b/kernel/power/zaxpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -112,7 +112,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 1f4c29210..57c3bed50 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 8(SP) stw r0, 16(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S index 8ec8b674a..ae8a93e89 100644 --- a/kernel/power/zgemm_kernel.S +++ b/kernel/power/zgemm_kernel.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -169,7 +169,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -190,7 +190,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 5526b91c9..dfe2d9dc6 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -132,7 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -296,7 +296,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S index 2b650cd02..2525a8e58 100644 --- a/kernel/power/zgemm_kernel_altivec.S +++ b/kernel/power/zgemm_kernel_altivec.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -264,7 +264,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S index 642d1f2e7..47a79064d 100644 --- a/kernel/power/zgemm_kernel_altivec_cell.S +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -244,7 +244,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -270,7 +270,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S index 0f7a6f9aa..c305270bd 100644 --- a/kernel/power/zgemm_kernel_altivec_g4.S +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S index 8fd6b0afb..3d179378b 100644 --- a/kernel/power/zgemm_kernel_cell.S +++ b/kernel/power/zgemm_kernel_cell.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -175,7 +175,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -196,7 +196,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -230,7 +230,7 @@ li PREA, 16 * 12 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S index bf6bf77e8..b92fb4225 100644 --- a/kernel/power/zgemm_kernel_g4.S +++ b/kernel/power/zgemm_kernel_g4.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -185,7 +185,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -206,7 +206,7 @@ #endif #ifdef TRMMKERNEL -#if 
defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S index 991a64373..5546dd2f6 100644 --- a/kernel/power/zgemm_kernel_hummer.S +++ b/kernel/power/zgemm_kernel_hummer.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S index 471d3b9ae..d14cb1cd9 100644 --- a/kernel/power/zgemm_kernel_power3.S +++ b/kernel/power/zgemm_kernel_power3.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -161,7 +161,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -202,7 +202,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 3c28649bc..9b47b9fc1 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -199,7 +199,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -220,7 +220,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index 813f270b8..d1e60da6c 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
std r0, FLINK_SAVE(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S index 748b69a0c..ba99a21c5 100644 --- a/kernel/power/zgemm_kernel_ppc440.S +++ b/kernel/power/zgemm_kernel_ppc440.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -182,7 +182,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -203,7 +203,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index f93439986..708f1318d 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -250,7 +250,7 @@ stw r22, 176(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S index 55dd2d84f..bd1148b65 100644 --- a/kernel/power/zgemv_n_ppc440.S +++ b/kernel/power/zgemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -223,7 +223,7 @@ stw r22, 176(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index 9c6f510c2..d82fab16a 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -226,7 +226,7 @@ stw r0, 4 + FZERO #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S index bfc039a0c..d7f3ee027 100644 --- a/kernel/power/zgemv_t_ppc440.S +++ b/kernel/power/zgemv_t_ppc440.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -179,7 +179,7 @@ stw r0, 4 + FZERO #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zger.S b/kernel/power/zger.S index a9a607815..73757d448 100644 --- a/kernel/power/zger.S +++ b/kernel/power/zger.S @@ -47,7 +47,7 @@ #endif #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -235,7 +235,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) 
diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S index 2eb7b0df3..ae68ee672 100644 --- a/kernel/power/zscal.S +++ b/kernel/power/zscal.S @@ -43,7 +43,7 @@ #define XX r4 #define PREA r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S index d0e4c9bcf..55dd1b87b 100644 --- a/kernel/power/zscal_ppc440.S +++ b/kernel/power/zscal_ppc440.S @@ -43,7 +43,7 @@ #define XX r4 #define PRE r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S index 8befadca2..415164a2b 100644 --- a/kernel/power/zswap.S +++ b/kernel/power/zswap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -117,7 +117,7 @@ stfd f30, 128(SP) stfd f31, 136(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index b348e328f..9f00df072 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -259,7 +259,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index b631cbe35..fe97fde8b 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define IS r4 @@ -256,7 +256,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index c1415138c..684cbd6eb 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -280,7 +280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S index 87473b45d..3acd9562d 100644 --- a/kernel/power/ztrsm_kernel_LN.S +++ b/kernel/power/ztrsm_kernel_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -244,7 +244,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S index db0860124..2d4f31189 100644 --- a/kernel/power/ztrsm_kernel_LT.S +++ b/kernel/power/ztrsm_kernel_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S index c50ab86df..605363119 100644 --- a/kernel/power/ztrsm_kernel_RT.S +++ b/kernel/power/ztrsm_kernel_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S index 884a3e864..4798b5958 100644 --- a/kernel/power/ztrsm_kernel_cell_LN.S +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S 
index 388dfe3c2..654938a4d 100644 --- a/kernel/power/ztrsm_kernel_cell_LT.S +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -246,7 +246,7 @@ li PREA, 16 * 12 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S index 00b50fe04..e3fe84d00 100644 --- a/kernel/power/ztrsm_kernel_cell_RT.S +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S index bf3eafa45..042f4d476 100644 --- a/kernel/power/ztrsm_kernel_hummer_LN.S +++ b/kernel/power/ztrsm_kernel_hummer_LN.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S index 865c85f78..fc8a0bef8 100644 --- a/kernel/power/ztrsm_kernel_hummer_LT.S +++ b/kernel/power/ztrsm_kernel_hummer_LT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S index 99868f948..17e31ffa8 100644 --- a/kernel/power/ztrsm_kernel_hummer_RT.S +++ b/kernel/power/ztrsm_kernel_hummer_RT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S index 65b8077db..3c40f605a 100644 --- a/kernel/power/ztrsm_kernel_power6_LN.S +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S index c27170604..b2a92301d 100644 --- a/kernel/power/ztrsm_kernel_power6_LT.S +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if 
defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S index ff0338cdc..cf37b5ca0 100644 --- a/kernel/power/ztrsm_kernel_power6_RT.S +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S index d33522456..f0be64d81 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LN.S +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S index a9e7b891f..d5ff1b57f 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LT.S +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S index 43f4b07cb..b77dd76d1 100644 --- a/kernel/power/ztrsm_kernel_ppc440_RT.S +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif From 5a4f1a21188a99d935482f1bda057d4ea42d34f4 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 28 Jun 2019 10:29:44 +0000 Subject: [PATCH 633/935] Fix build for PPC970 on FreeBSD pt. 1 FreeBSD needs DCBT_ARG=0 as well. 
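DCBT_ARG parameterizes the dcbt data-cache-touch (prefetch) instructions that common_power.h emits; presumably the PPC970 toolchains on Darwin and FreeBSD accept only the plain, unhinted form, so the extra operand has to be 0 there (that reading is inferred; the patch itself only states the requirement). After this change the guard reads:

#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || \
    defined(POWER8) || defined(POWER9) || \
    ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
#define DCBT_ARG 0   /* plain dcbt, no prefetch hint */
#else
#define DCBT_ARG 8   /* hinted form for cores/toolchains that accept it */
#endif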
--- common_power.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_power.h b/common_power.h index f38b85864..5e15b7554 100644 --- a/common_power.h +++ b/common_power.h @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) ) #define DCBT_ARG 0 #else #define DCBT_ARG 8 From 7c7505a7784a698ecfac453284080b5074a7b102 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 28 Jun 2019 10:31:45 +0000 Subject: [PATCH 634/935] Fix build for PPC970 on FreeBSD pt.2 FreeBSD needs those macros too. --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 9a1a68ecd..0f354f2bc 100644 --- a/param.h +++ b/param.h @@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#if defined(OS_LINUX) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_DARWIN) || defined(OS_FREEBSD) #if L2_SIZE == 1024976 #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 256 From a97b301aaabbe4bdf99a4506cc6a007d707f8b14 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Tue, 18 Jun 2019 15:55:56 +0000 Subject: [PATCH 635/935] cgemm/ctrmm power9 --- kernel/power/KERNEL.POWER9 | 6 +- kernel/power/cgemm_kernel_power9.S | 293 +++ kernel/power/cgemm_logic_power9.S | 2816 ++++++++++++++++++++++++++ kernel/power/cgemm_macros_power9.S | 3019 ++++++++++++++++++++++++++++ kernel/power/zgemm_logic_power9.S | 2 +- param.h | 4 +- 6 files changed, 6134 insertions(+), 6 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power9.S create mode 100644 kernel/power/cgemm_logic_power9.S create mode 100644 kernel/power/cgemm_macros_power9.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 0f91d6d7d..31a5deeba 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -5,7 +5,7 @@ STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S @@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMKERNEL = cgemm_kernel_power9.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = cgemm_tcopy_8_power8.S +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S new file mode 100644 index 000000000..4b5c2fa31 --- /dev/null +++ b/kernel/power/cgemm_kernel_power9.S @@ -0,0 +1,293 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, 
FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S new file mode 100644 index 000000000..b4f937e90 --- /dev/null +++ b/kernel/power/cgemm_logic_power9.S @@ -0,0 +1,2816 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + 
KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + 
KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. 
L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + 
KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 
64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
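+/* note: the closing _E2 step (Complete=1 in the KERNEL*_2 macros) runs the
+   remaining multiply-adds without preloading another block and advances
+   AO/BO past the data consumed by this subroutine */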
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S new file mode 100644 index 000000000..a256e1a01 --- /dev/null +++ b/kernel/power/cgemm_macros_power9.S @@ -0,0 +1,3019 @@ + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define unit_size 8
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* we assume {-alpha_r,-alpha_i} for this case */
+ /* i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* we negate alpha_i (imaginary part) instead to fix the sign */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#else // CC || CR || RC || RR
+ /* we assume {-alpha_r,-alpha_i} for this case */
+ /* i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* we negate alpha_i (imaginary part) instead to fix the sign */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} ; VSOUT2 = {r0,r1} * {alpha_i,alpha_i} */
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmulsp \VSOUT1,\VSINII, alpha_i
+ xvmulsp \VSOUT2,\VSINRR, alpha_i
+.endm
+
+/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r
+.endm
+
+/* macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro Zero4x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56,
vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD4x8 + LOAD4x8O 0,0 +.endm + + +.macro LOAD4x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_NORMAL + END4x8 AO,BO,64,32 +.endm + + +.macro END4x8_WITHOUT_ADD + END4x8 AO,BO,0,0 +.endm + + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.endm + + +.macro LOAD4x8_2 + LOAD4x8_2O 0,0 +.endm + + +.macro LOAD4x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 +.endm + + +.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm 
vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv 
vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, 
(\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + 
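+/* the xxpermdi pairs above gather the two complex results belonging to one
+   column of C, so each 16-byte accumulate/store below covers one column
+   (CO, T1, T2, T3 = C, C+LDC, C+2*LDC, C+3*LDC) */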
xvaddsp vs26,vs26,vs9 + xvaddsp vs25,vs25,vs3 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs10,vs2,0 + xxpermdi vs25,vs0,vs8,3 + xxpermdi vs27,vs2,vs10,3 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 0(T1) + stxv vs26 , 0(T2) + stxv vs27 , 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD4x1 + LOAD4x1O 0,0 +.endm + + +.macro LOAD4x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END4x1_NORMAL + END4x1 AO,BO,8,32 +.endm + + +.macro END4x1_WITHOUT_ADD + END4x1 AO,BO,0,0 +.endm + + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD4x1_2 + LOAD4x1_2O 0,0 +.endm + + +.macro LOAD4x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) +.endm + + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 +.endm + + +.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8,32 +.endm + + +.macro SAVE4x1 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + 
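+/* PART1/PART2 together apply the complex scaling (r + i*I)*(alpha_r + alpha_i*I):
+   PART1: out1 = i*alpha_i        ; out2 = r*alpha_i
+   PART2: out1 = r*alpha_r - out1 ; out2 = out2 + i*alpha_r
+   leaving the real part in out1 and the imaginary part in out2 */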
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
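+/* Complete==0: refill vs4/vs5 with the next unrolled step's A vectors while the
+   FMAs above drain. DISP16(ind,disp) = ind*unit_size*16 + disp, so one
+   2x-unrolled step of the packed M=8 complex-float A panel is DISP16(1,0) = 128 bytes */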
+.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv 
vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv 
vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, 
(64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro 
LOAD1x1_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+.endm
+
+
+.macro END1x1_2
+ /*for load2 offset will be 16 and 16*/
+ KERNEL1x1_2 AO,BO, 16,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs8, DISP2(\Index,\OffsetB)(\BREG)
+ lxv vs4, DISP2(\Index,\OffsetB)(\AREG)
+ xxperm vs10, vs8, permute_mask
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP2(\Index,16)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 8,8
+.endm
+
+
+.macro SAVE1x1
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+ /*aggregate x2*/
+ xxpermdi vs33,vs32,vs32,2
+ xxpermdi vs41,vs40,vs40,2
+ xvaddsp vs32,vs32,vs33
+ xvaddsp vs40,vs40,vs41
+
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ /*inner reverse save_permute and store vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs37,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs37,vs1
+
+/* reconstruct r,i pairs*/
+ xxperm vs37,vs1, vs28
+
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs36,vs36,vs37
+ stxsd v4 , 0(CO)
+#else
+
+/* vs37 is v5 */
+ stxsd v5 , 0(CO)
+#endif
+ addi CO, CO, 8
+.endm
+
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 3
+ .endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*8;
+// ptrbb = bb + off*4;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+8; // number of values in A
+// #else
+// temp = off+4; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) &&
!defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index f902484a3..fe5d8ade2 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1353,7 +1353,7 @@ ZGEMM_L1: ZGEMM_L1_BEGIN: /*----------------------------------------*/ mr CO, C - slwi T1, LDC , 1 + add T2,C,LDC mr AO, A add C, C, T1 diff --git a/param.h b/param.h index 3934da6c8..84e577acc 100644 --- a/param.h +++ b/param.h @@ -2250,12 +2250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 832 #define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 640 +#define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 1026 #define DGEMM_DEFAULT_Q 384 -#define CGEMM_DEFAULT_Q 640 +#define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 #define SYMV_P 8 From 9086543f503f63d9107ce539650f28918b027015 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:29:47 +0200 Subject: [PATCH 636/935] Utest needs CBLAS but not necessarily FORTRAN --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 50da721cd..d7d9c2fce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,7 +211,8 @@ if (USE_THREAD) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() -if (MSVC OR NOT NOFORTRAN) +#if (MSVC OR NOT NOFORTRAN) +if (NOT NO_CBLAS) # Broken without fortran on unix add_subdirectory(utest) endif() From ae9e8b131e27f65684cf4cb98e03b7df4b290142 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:30:33 +0200 Subject: [PATCH 637/935] Add mingw builds to Appveyor config --- appveyor.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 44a616aaa..2f9cc7b0b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,7 +35,14 @@ environment: DYNAMIC_ARCH: ON WITH_FORTRAN: no - COMPILER: cl - + - COMPILER: MinGW64-gcc-7.2.0-mingw + DYNAMIC_ARCH: OFF + WITH_FORTRAN: ignore + - COMPILER: MinGW64-gcc-7.2.0 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-5.3.0 + WITH_FORTRAN: ignore + install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force @@ -52,7 +59,14 @@ install: before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + - if [%COMPILER%]==[MinGW-gcc-5.3.0] set 
PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
 - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
+ - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
 - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
 - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
 - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
@@ -64,3 +78,4 @@ test_script:
 - echo Running Test
 - cd utest
 - openblas_utest
+

From f69a0be712a9dccf5fcf433a734eb1371cb6189a Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 6 Jul 2019 15:02:39 +0200
Subject: [PATCH 638/935] Add getarch flags to disable AVX on x86 (and other
 small fixes to match Makefile behaviour)

---
 cmake/system.cmake | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 7f3696286..1c2093efe 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -70,6 +70,13 @@ if (X86_64)
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
 endif ()
 
+# No AVX support is available in 32-bit (x86) builds
+if (X86 OR X86_64)
+if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("${CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
+  set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
+endif ()
+endif ()
+
 if (INTERFACE64)
   message(STATUS "Using 64-bit integers.")
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@@ -148,7 +155,9 @@ else()
 endif ()
 
 include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
-
+if (DEFINED BINARY)
+  message(STATUS "Compiling a ${BINARY}-bit binary.")
+endif ()
 if (NOT DEFINED NEED_PIC)
   set(NEED_PIC 1)
 endif ()
@@ -165,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
 if (NOT NOFORTRAN)
   # Fortran Compiler dependent settings
   include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
+else ()
+set(NO_LAPACK 1)
+set(NO_LAPACKE 1)
 endif ()
 
 if (BINARY64)
@@ -190,9 +202,14 @@ if (NEED_PIC)
 endif ()
 
 if (DYNAMIC_ARCH)
-  set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
-  if (DYNAMIC_OLDER)
-    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+  if (X86 OR X86_64 OR ARM64 OR PPC)
+    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
+    if (DYNAMIC_OLDER)
+      set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+    endif ()
+  else ()
+    unset (DYNAMIC_ARCH)
+    message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
  endif ()
 endif ()

From 04d671aae2b452a0bf63837c289f8948c35eb675 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 6 Jul 2019 15:05:04 +0200
Subject: [PATCH 639/935] Make disabling DYNAMIC_ARCH on unsupported systems
 work

needs to be unset in the cache for the change to have any effect

---
 cmake/arch.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index b4547b7c9..5a7434551 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake @@ -81,7 +81,8 @@ if (DYNAMIC_ARCH) endif () if (NOT DYNAMIC_CORE) - unset(DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") + unset(DYNAMIC_ARCH CACHE) endif () endif () From 8fb76134bc0711634b410fa20d6eb113f8893a04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:07:15 +0200 Subject: [PATCH 640/935] Mingw32 needs leading underscore on object names (also copy BUNDERSCORE settings for FORTRAN from the corresponding Makefile) --- cmake/prebuild.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a67c44bf5..e508a46c2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -59,6 +59,9 @@ set(FU "") if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) set(FU "_") endif() +if(MINGW AND NOT MINGW64) + set(FU "_") +endif() set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) if (${COMPILER_ID} STREQUAL "GNU") @@ -82,6 +85,11 @@ endif () # f_check if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") +else () + file(APPEND ${TARGET_CONF_TEMP} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") + set(BU "_") endif () # Cannot run getarch on target if we are cross-compiling From b89c781637503ec66117eb3b887a3755d42f0f46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jul 2019 16:04:45 +0200 Subject: [PATCH 641/935] Fix surprising behaviour of NO_AFFINITY=0 --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..09a648e4a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1124,8 +1124,12 @@ endif endif ifdef NO_AFFINITY +ifeq ($(NO_AFFINITY), 0) +override undefine NO_AFFINITY +else CCOMMON_OPT += -DNO_AFFINITY endif +endif ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE From b89d9762a29ac84422ebb6092584831efd85d355 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Mon, 8 Jul 2019 17:13:21 -0500 Subject: [PATCH 642/935] Change install_name on osx to match linux --- Makefile | 1 + Makefile.install | 3 ++- exports/Makefile | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 07b08439e..60f189ef2 100644 --- a/Makefile +++ b/Makefile @@ -109,6 +109,7 @@ endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll diff --git a/Makefile.install b/Makefile.install index fefecd98d..8070b4729 100644 --- a/Makefile.install +++ b/Makefile.install @@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" diff --git a/exports/Makefile b/exports/Makefile index b1348bd4a..d32e449df 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > 
$(@F) +ifeq ($(OSNAME), Darwin) +INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib +endif + ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else @@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c From 0ba29fd2625dfe405a08005a22d0fa21293cc16c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:46:51 +0800 Subject: [PATCH 643/935] Update dgemm_kernel_4x8_haswell.S for zen2 replaced a bunch of vpermpd instructions with vpermilpd and vperm2f128 --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 120 ++++++++++------------- 1 file changed, 54 insertions(+), 66 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c84b599ce..5416018bb 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 @@ -153,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -206,7 +206,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
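 /* vpermilpd $0x05 selects doubles in the order (1,0,3,2), exactly like the
    vpermpd $0xb1 it replaces throughout this patch, but as an in-lane
    shuffle; presumably the cross-lane permute is what made vpermpd costly
    on Zen 2 */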
addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -222,7 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -232,7 +232,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -247,7 +247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO @@ -257,7 +257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -284,18 +284,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -324,18 +322,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -365,18 +361,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
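 /* the "vpermpd $0x1b; vpermpd $0xb1" pairs removed below compose to the
    element order (2,3,0,1), i.e. a swap of the two 128-bit halves, which
    the single vperm2f128 $0x01 that replaces them performs directly */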
prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -687,7 +681,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -695,7 +689,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -710,14 +704,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -729,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -737,7 +731,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -750,7 +744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -758,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -770,7 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO @@ -778,7 +772,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -799,18 +793,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,18 +831,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1084,13 +1074,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1100,12 +1090,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1114,13 +1104,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1130,13 +1120,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
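 /* the 4x4 kernel generates all four products from one B vector by rotating
    the A register through the orders (1,0,3,2), (2,3,0,1) and (3,2,1,0), so
    each ymm accumulator holds one diagonal of the 4x4 tile; the
    blend/permute sequences in the SAVE macros later unscramble those
    diagonals back into rows */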
.macro KERNEL4x4_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm @@ -1145,13 +1135,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1165,18 +1155,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 7a9050d6817dd63e4b3cb641566b03f069be47a9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:55:06 +0800 Subject: [PATCH 644/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5416018bb..b98610524 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -292,8 +292,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -330,8 +330,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -369,8 +369,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -801,8 +801,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
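 /* vperm2f128 takes two source registers besides the immediate and the
    destination; the two-register spelling introduced in the previous commit
    is not a valid encoding, hence the extra ", %ymm2" / ", %ymm3" operands
    added in these hunks */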
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,8 +839,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 182b06d6adb445d00066eff3b15da335ee1656bc Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 17:02:35 +0800 Subject: [PATCH 645/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 40 ++++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b98610524..814a1c350 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -317,10 +317,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -356,10 +356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -395,10 +395,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm @@ -826,10 +826,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -865,10 +865,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 1733f927e6b892610bda045538a42d495faa1af5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 21:27:41 +0800 Subject: [PATCH 646/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 814a1c350..b30ecccea 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -106,7 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define A_PR1 512 -#define B_PR1 512 +#define B_PR1 160 /******************************************************************************************* * Macro definitions From 211ab03b1402a3c39311b7ca769aaad736ca554c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 22:39:15 +0800 Subject: [PATCH 647/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b30ecccea..3f7f9a98e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,23 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12 + prefetcht0 128(%rsp) /*BUFFER 1*/ vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - + prefetcht0 192(%rsp) vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - + prefetcht0 256(%rsp) vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - + prefetcht0 320(%rsp) vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 From 8a074b39656636ebec5812532b486cf751231a3b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:47:30 +0800 Subject: [PATCH 648/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 42 +++++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 3f7f9a98e..5242e3efe 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,24 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
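/* The previous patch aimed these prefetches at hard-coded %rsp offsets; here   */
/* they are rewritten in terms of BUFFER1, which appears to be the stack        */
/* offset of the packed buffer defined elsewhere in this file, so the prefetch  */
/* target survives any later change to the frame layout.                        */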
.macro SAVE4x12 - prefetcht0 128(%rsp) /*BUFFER 1*/ + prefetcht0 BUFFER1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 192(%rsp) + prefetcht0 64 + BUFFER1 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - prefetcht0 256(%rsp) + prefetcht0 128 + BUFFER1 vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - prefetcht0 320(%rsp) + prefetcht0 192 + BUFFER1 vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1606,6 +1606,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + addq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + leaq (CO1,LDC,2),CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + subq LDC,CO1 +.endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) @@ -1773,7 +1804,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - + + PREFETCHT0_C .L12_12a: KERNEL4x12_M1 From 9b04baeaeeaaaeba8c12e3fc2418ceaeca53ebb0 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:50:03 +0800 Subject: [PATCH 649/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5242e3efe..42692f33b 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -318,10 +318,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -357,10 +357,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -396,10 +396,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
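/* prefetcht0 fetches into every cache level; prefetcht1 fetches into L2 and    */
/* below, bypassing L1. Using t1 for the C tile, which is touched only once     */
/* per SAVE, keeps those lines from evicting the hot A/B data out of L1.        */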
vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 9c89757562f43af48645a6563161909321077646 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:47:58 +0800 Subject: [PATCH 650/935] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 29 +++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 42692f33b..e26bddea3 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,6 +1865,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L12_11 ALIGN_4 @@ -1872,6 +1880,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L12_20: // Test rest of M @@ -2102,7 +2115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jmp .L13_16 - + PREFETCHT0_C .L13_13: test $1, %rax @@ -2147,6 +2160,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L13_11 ALIGN_4 @@ -2154,6 +2175,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L13_20: // Test rest of M From 825777faab163326f38a0e6203ef1fb6fa8de6af Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:58:24 +0800 Subject: [PATCH 651/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index e26bddea3..225af3673 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,12 +1865,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) - addq $128, B + addq $128, B /* increment */ sarq $3, K decq I # i -- @@ -1880,6 +1883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
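/* Mechanics of the block prefetch added here: salq $3,K scales the loop count  */
/* to bytes (K doubles = 8*K bytes), so (B,K,8) addresses 64*K bytes past B --  */
/* a K-proportional distance toward the next packed-B block. B itself advances  */
/* by a fixed 128 bytes per 4x12 tile, and sarq $3,K restores the count.        */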
/************************************************************************** * Rest of M ***************************************************************************/ + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I @@ -2160,6 +2164,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 (B) prefetcht2 (B, K, 8) @@ -2175,7 +2182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ - + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I From f49f8047acbea636eb2a3542f306803a1285793b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 14:33:37 +0800 Subject: [PATCH 652/935] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 50 ++++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 225af3673..6d1460bb2 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 >= 96 prefetcht0 128 + BUFFER1 +#endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 >= 160 prefetcht0 192 + BUFFER1 +#endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 - +#if B_PR1 >= 224 + prefetcht0 256 + BUFFER1 +#endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - +#if B_PR1 >= 288 + prefetcht0 320 + BUFFER1 +#endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - +#if B_PR1 >= 352 + prefetcht0 384 + BUFFER1 +#endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#if B_PR1 >= 416 + prefetcht0 448 + BUFFER1 +#endif leaq (CO1, LDC, 2), %rax +#if B_PR1 >= 480 + prefetcht0 512 + BUFFER1 +#endif #if !defined(TRMMKERNEL) @@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) addq $128, B /* increment */ +#endif sarq $3, K decq I # i -- @@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/************************************************************************** * Rest of M ***************************************************************************/ - /* recover the original value of pointer B */ + + /* recover the original value of pointer B after prefetch */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L12_20: @@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) addq $128, B +#endif sarq $3, K decq I # i -- @@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* recover the original value of pointer B */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L13_20: From 94db259e5b432a7f1769c1d61071b9dd727778db Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:04:41 +0800 Subject: [PATCH 653/935] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 45 ++++++++++-------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6d1460bb2..6a8619e32 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,35 +1622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C + prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) prefetcht0 24(CO1,LDC,4) prefetcht0 (CO1,LDC,8) prefetcht0 24(CO1,LDC,8) - addq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - leaq (CO1,LDC,2),CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - subq LDC,CO1 .endm /*******************************************************************************************/ @@ -1820,12 +1798,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - PREFETCHT0_C .L12_12a: - + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2133,9 +2118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13_12a: + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2145,7 +2139,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
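/* The BUFFER1 prefetch chain becomes conditional on B_PR1, so the compile-time */
/* prefetch depth decides how far ahead of the stores the packed buffer is      */
/* pulled; a later patch in this series relaxes the >= guards to >, so settings */
/* between cache-line multiples still get the next prefetch. The WINDOWS_ABI    */
/* split reflects the different GEMM_P/GEMM_Q ratio there, as the patch's own   */
/* comments note, halving the per-tile B increment to 64 bytes.                 */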
jmp .L13_16 - PREFETCHT0_C .L13_13: test $1, %rax From 9440fa607d146f1b91d70e35404f0d4abe50ffc5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:08:22 +0800 Subject: [PATCH 654/935] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6a8619e32..c834239be 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,7 +1622,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C - prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) @@ -1799,6 +1798,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L12_12 .L12_12a: + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 @@ -2117,7 +2117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L13_12 .L13_12a: - + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 From 4801c6d36bd87421b08e60efa1b6e0217fd41672 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 00:47:45 +0800 Subject: [PATCH 655/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c834239be..26eea0acf 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1866,7 +1866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ @@ -2184,19 +2184,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ prefetcht2 (B) prefetcht2 (B, K, 8) - addq $64, B + addq $64, B /* increment */ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) - addq $128, B + addq $128, B /* increment */ #endif sarq $3, K From 95fb98f556adcbbccc5f42318c7c645ec1837e1a Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 01:10:32 +0800 Subject: [PATCH 656/935] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 26eea0acf..082e62a7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,43 +279,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 >= 96 +#if B_PR1 > 32 prefetcht0 128 + BUFFER1 #endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 >= 160 +#if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 -#if B_PR1 >= 224 +#if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#if B_PR1 >= 288 +#if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#if B_PR1 >= 352 +#if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#if B_PR1 >= 416 +#if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif leaq (CO1, LDC, 2), %rax -#if B_PR1 >= 480 +#if B_PR1 > 416 prefetcht0 512 + BUFFER1 #endif From 28e96458e5a4b2d8039ed16048a07892a7c960bf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Jul 2019 08:28:16 +0200 Subject: [PATCH 657/935] Replace vpermpd with vpermilpd to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180) --- kernel/x86_64/zdot_microk_haswell-2.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 9f2fc2c1d..4eade7bfd 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" @@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// 
"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" From 3f6ab1582aca019cf5514aac3af98dcb66c9bbd6 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Mon, 22 Jul 2019 21:24:57 -0600 Subject: [PATCH 658/935] MAINT: remove legacy CMake endif() * clean up a case where CMake endif() contained the conditional used in the if(), which is no longer needed / discouraged since our minimum required CMake version supports the modern syntax --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94d3ba643..610f689e0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX") EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) if(${OPERATING_SYSTEM} MATCHES "Android") set(HOST_OS ANDROID) - endif(${OPERATING_SYSTEM} MATCHES "Android") + endif() endif() From af2e7f28fce42e39fd3d4e108dfb4d55b377b5ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 23 Jul 2019 16:56:40 +0200 Subject: [PATCH 659/935] Override special make variables as seen in https://github.com/xianyi/OpenBLAS/issues/1912#issuecomment-514183900 , any external setting of TARGET_ARCH (which could result from building OpenBLAS as part of a larger project that actually uses this variable) would cause the utest build to fail. (Other subtargets appear to be unaffected as they do not use implicit make rules) --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..5846db0bb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. +override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all From 30efed14d1aa9e1fba887aeddac964b841dd4720 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Jul 2019 15:26:09 +0200 Subject: [PATCH 660/935] Unset special make variables in ctest Makefile as well --- ctest/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 569a5dda3..f562c9bb3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,8 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= LIB = $(TOPDIR)/$(LIBNAME) From 7eecd8e39cfd3bf3f8eddc1154b8b2bfec19ea33 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 28 Jul 2019 07:39:09 +0800 Subject: [PATCH 661/935] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 334 ++++++++++++++++++++++- 1 file changed, 325 insertions(+), 9 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 082e62a7c..19e32ef2c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -107,6 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PR1 512 #define B_PR1 160 +#define BROADCASTKERNEL /******************************************************************************************* * Macro definitions @@ -133,7 +134,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 prefetcht0 B_PR1(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1+64(BO) vmovups -8 * SIZE(BO), %ymm2 prefetcht0 B_PR1+128(BO) @@ -143,17 +148,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -165,23 +182,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_M1 prefetcht0 A_PR1(AO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -192,21 +224,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -218,21 +266,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
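/* Displacement quirk in the _M2/_SUB variants below: AO is advanced before     */
/* the final broadcast, so the last displacement is written against the new     */
/* pointer -- vbroadcastsd -17*SIZE(AO) after addq $8*SIZE,AO is the old        */
/* -9*SIZE element, and after addq $4*SIZE,AO it is the old -13*SIZE one.       */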
.macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -241,23 +305,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_SUB vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -289,27 +369,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + #if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + #if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + #if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + #if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif @@ -338,11 +444,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht1 56(%rax) prefetcht1 56(%rax,LDC) - vpermilpd $ 0x05 , %ymm9 , %ymm9 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 @@ -353,7 +469,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -377,6 +493,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht1 56(%rbp) prefetcht1 56(%rbp,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -392,7 +518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp @@ -693,19 +819,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_I vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -715,19 +857,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x8_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -736,18 +893,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -757,18 +930,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -776,19 +965,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x8_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -809,6 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -824,6 +1039,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax @@ -847,6 +1063,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 56(%rax) prefetcht0 56(%rax,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -862,7 +1088,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -1088,15 +1314,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1104,29 +1346,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1134,30 +1407,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1171,6 +1476,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1186,6 +1501,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax From 2dfb804cb943ac12035fe51859d109daca76b4f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Jul 2019 23:17:28 +0200 Subject: [PATCH 662/935] Replace vpermpd with vpermilpd in the Haswell DTRMM kernel to improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186 --- kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c index 651736b89..2acdc4615 100644 --- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -33,7 +33,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" @@ -41,7 +41,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" @@ -62,18 +62,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" - " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t" + " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t" " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" @@ -85,18 +83,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmovups %%ymm6 , (%7) \n\t" " vmovups %%ymm7 , (%8) \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" From 648491e1aa5cec7e8b8947d8ce47a825ceba705d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:51:09 +0200 Subject: [PATCH 663/935] Autodetect 
Intel Ice Lake (as SKYLAKEX target) --- cpuid_x86.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..141d6044e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,20 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { From 3d36c4511693bfd7c117465a701c5ff1f19f8565 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:52:35 +0200 Subject: [PATCH 664/935] Add CPUID identification of Intel Ice Lake --- driver/others/dynamic.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 045fc65b8..f1cd3c6e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){ } } return NULL; + case 7: + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; case 9: case 8: - if (model == 14 ) { // Kaby Lake + if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { From acf6002ab242f98460845bb71db8fefdbdb26a1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Aug 2019 12:40:13 +0200 Subject: [PATCH 665/935] Replace most vpermpd calls in the Haswell DTRSM_RN kernel --- kernel/x86_64/dtrsm_kernel_RN_haswell.c | 36 +++++++++++-------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index 9ab78fc8e..cb939e762 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -132,7 +132,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "1: \n\t" " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd 
$0x05 , %%ymm0 , %%ymm3 \n\t" // was vpermpd 0xb1 " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" @@ -143,7 +143,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -160,7 +160,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -170,7 +170,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" @@ -185,7 +185,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" @@ -193,7 +193,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" @@ -204,7 +204,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -212,42 +212,38 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" "3: \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" 
" vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" - " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + " vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" + " vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t" " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" From 4e2f81cfa1f6dfa24912c3ff88470471b39b695e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Aug 2019 23:15:35 +0200 Subject: [PATCH 666/935] Provide more information on mmap/munmap failure for #2207 --- driver/others/memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f67cb01f4..77d2b72fa 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ +if (!release->address) return 0; + if (munmap(release -> address, BUFFER_SIZE)) { - printf("OpenBLAS : munmap failed\n"); + int errsv=errno; + perror("OpenBLAS : munmap failed:"); + printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); } } @@ -2073,6 +2077,12 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + } else { +#ifdef DEBUG + int errsv=errno; + perror("OpenBLAS : mmap failed:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); +#endif } #ifdef OS_LINUX From 1776ad82c01c0f9efeeda043eb02e10187084066 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Aug 2019 00:08:11 +0200 Subject: [PATCH 667/935] Add files via upload --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 77d2b72fa..534d6d9fc 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,7 +2041,7 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ -if (!release->address) return 0; +if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; From b7bbb02447ed612e380dc1ca6d6e7a26f48dc868 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 12:46:05 +0200 Subject: [PATCH 668/935] Silence two nuisance warnings from gcc --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a5e731d74..e8aa29813 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -94,7 +94,7 @@ int get_feature(char *search) if( p == NULL ) return 0; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { if (!strcmp(t, search)) { return(1); } } @@ -344,7 +344,7 @@ void get_features(void) if( p == NULL ) return; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { } From be147a9f28889d831019c6f860d501b2546e3771 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 16:24:39 +0200 
Subject: [PATCH 669/935] Avoid adding a spurious dependency on the fortran runtime despite NOFORTRAN=1 for cases where a fortran compiler is present but not wanted (e.g. not fully functional) --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..835c76e78 100644 --- a/Makefile.system +++ b/Makefile.system @@ -267,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv -# For detect fortran failed, only build BLAS. +# When fortran support was either not detected or actively deselected, only build BLAS. ifeq ($(NOFORTRAN), 1) NO_LAPACK = 1 +override FEXTRALIB = endif # From ebe2f47a0f0174928d798c95c3a9ec3f83cccbfa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:16:11 +0200 Subject: [PATCH 670/935] Set version to 0.3.7 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7d9c2fce..9c3b50fac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 7.dev) +set(OpenBLAS_PATCH_VERSION 7) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From d47fe78b0e5453dea32741e7cf14f2be10387ba8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:16:45 +0200 Subject: [PATCH 671/935] Set version to 0.3.7 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a299588e0..ca506bb35 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.7.dev +VERSION = 0.3.7 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 5f36f18148603facb6c3540e673610d6b24cbfbb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:23:27 +0200 Subject: [PATCH 672/935] Update with 0.3.7 changes --- Changelog.txt | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..205ca02e2 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,47 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.7 +11-Aug 2019 + +common: + * having the gmake special variables TARGET_ARCH or TARGET_MACH + defined no longer causes build failures in ctest or utest + * defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer + has the same effect as setting them to 1 + * a new test program was added to allow checking the library for + thread safety + * a new option USE_LOCKING was added to ensure thread safety when + OpenBLAS itself is built without multithreading but will be + called from multiple threads. 
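+ (a typical configuration for such a build would be "make USE_THREAD=0 USE_LOCKING=1")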
+ * a build failure on Linux with glibc versions earlier than 2.5 + was fixed + * a runtime error with CPU enumeration (and NO_AFFINITY not set) + on glibc 2.6 was fixed + * NO_AFFINITY was added to the CMAKE options (and defaults to being + active on Linux, as in the gmake builds) + +x86_64: + * the build-time logic for detection of AVX512 availability in + the processor and compiler was fixed + * gmake builds on OSX now set the internal name of the library to + libopenblas.0.dylib (consistent with CMAKE) + * the Haswell DGEMM kernel received a significant speedup through + improved prefetch and load instructions + * performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly + increased by avoiding vpermpd instructions + * the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled + to fix remaining errors in DGEMM, DSYMM and DTRMM + +## POWER: + * added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970 + * added optimized kernels for POWER9 single and double precision complex BLAS3 + * added optimized kernels for POWER9 SGEMM and STRMM + +## ARMV7: + * fixed the softfp implementations of xAMAX and IxAMAX + * removed the predefined -march= flags on both ARMV5 and ARMV6 as + they were appropriate for only a subset of platforms + ==================================================================== Version 0.3.6 29-Apr-2019 From 7b6808b69ca706c724a4258e7cfb460b3c8c25a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:28:13 +0200 Subject: [PATCH 673/935] Increment version to 0.3.8.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7d9c2fce..74db77135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 7.dev) +set(OpenBLAS_PATCH_VERSION 8.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 02d92039811af88acd7e2b3d9fe4726c9f1008f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:28:47 +0200 Subject: [PATCH 674/935] Increment version to 0.3.8.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a299588e0..c0941e488 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.7.dev +VERSION = 0.3.8.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 303869f5724bb86d722bc32f254a976625ea2046 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 23:31:36 +0200 Subject: [PATCH 675/935] Update with changes from 0.3.7 --- Changelog.txt | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..f160a4e13 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,46 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.7 +11-Aug 2019 + +common: + * having the gmake special variables TARGET_ARCH or TARGET_MACH + defined no longer causes build failures in ctest or utest + * defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer + has the same effect as setting them to 1 + * a new test program was added to allow checking the library for + thread safety + * a new option USE_LOCKING was added to ensure thread safety when + OpenBLAS itself is built without multithreading but will be + called from multiple threads. + * a build failure on Linux with glibc versions earlier than 2.5 + was fixed + * a runtime error with CPU enumeration (and NO_AFFINITY not set) + on glibc 2.6 was fixed + * NO_AFFINITY was added to the CMAKE options (and defaults to being + active on Linux, as in the gmake builds) + +x86_64: + * the build-time logic for detection of AVX512 availability in + the processor and compiler was fixed + * gmake builds on OSX now set the internal name of the library to + libopenblas.0.dylib (consistent with CMAKE) + * the Haswell DGEMM kernel received a significant speedup through + improved prefetch and load instructions + * performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly + increased by avoiding vpermpd instructions + * the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled + to fix remaining errors in DGEMM, DSYMM and DTRMM + +## POWER: + * added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970 + * added optimized kernels for POWER9 SGEMM and STRMM + +## ARMV7: + * fixed the softfp implementations of xAMAX and IxAMAX + * removed the predefined -march= flags on both ARMV5 and ARMV6 as + they were appropriate for only a subset of platforms + ==================================================================== Version 0.3.6 29-Apr-2019 From aef9804089b0c968806a0fc3cfb0219359ce42b2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Aug 2019 10:19:10 +0200 Subject: [PATCH 676/935] Fix unwanted case-sensitivity in x86 LSAME for (AMD) processors without CMOV Problem was already noticed some years ago in #238, but back then the problem was only corrected in one of the #ifdef branches. 
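The intended behaviour here is a plain case-insensitive comparison of two single characters; a minimal C sketch of that contract (illustrative only, not a transliteration of the assembly below):

    #include <ctype.h>

    /* LSAME('N','n') and LSAME('n','N') must both return true */
    int lsame_ref(char ca, char cb)
    {
        return toupper((unsigned char)ca) == toupper((unsigned char)cb);
    }

The one-character change below only matters for the boundary case where the subtraction yields exactly zero: "jle" also takes the branch (and skips the register update) for a result of 0, while "jl" takes it only for a negative result.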
Fixes #2214 --- kernel/x86/lsame.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86/lsame.S b/kernel/x86/lsame.S index 3ac7a7314..2a2ab2bb5 100644 --- a/kernel/x86/lsame.S +++ b/kernel/x86/lsame.S @@ -56,13 +56,13 @@ #ifndef HAVE_CMOV movl %eax, %ecx subl $32, %ecx - jle .L1 + jl .L1 movl %ecx, %eax .L1: movl %edx, %ecx subl $32, %ecx - jle .L2 + jl .L2 movl %ecx, %edx .L2: subl %eax, %edx From a1fce677435a79d3cb577086793556d87ff76552 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Aug 2019 22:29:48 +0200 Subject: [PATCH 677/935] Make the new DGEMM regression test properly depend on CBLAS and LAPACKE fixes #2215 --- utest/CMakeLists.txt | 5 +++++ utest/Makefile | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 4e647cadc..1e3051a8f 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,9 +38,14 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c + ) +if (NOT NO_CBLAS AND NOT NO_LAPACKE) +set(OpenBLAS_utest_src + ${OpenBLAS_utest_src} test_kernel_regress.c ) endif() +endif() set(OpenBLAS_utest_bin openblas_utest) add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..8c7e6b9f8 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. +override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all @@ -13,8 +16,12 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +ifneq ($(NO_CBLAS), 1) +ifneq ($(NO_LAPACKE), 1) OBJS += test_kernel_regress.o endif +endif +endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch From 9ef96b32a6cc9a41908e832f2f713462bb94f40f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2019 22:09:12 +0200 Subject: [PATCH 678/935] Add multithreading support to the x86_64 zdot kernel (#2222) * Add multithreading support copied from the ThunderX2T99 kernel. 
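The change follows the usual split-and-reduce shape: each worker computes a partial complex dot product over its slice of x and y, and the caller sums the per-thread results (falling back to a single thread for small n or zero strides). Below is a self-contained sketch of that pattern using plain pthreads and C99 complex types; the real code instead dispatches through OpenBLAS's blas_level1_thread_with_return_value helper and also handles strides and the conjugated variants:

    #include <complex.h>
    #include <pthread.h>

    #define NTH_MAX 64                /* sketch-only bound on thread count */

    typedef struct {
        const double complex *x, *y;
        long n;
        double complex out;           /* per-thread result slot */
    } slice_t;

    static void *partial_zdot(void *arg)
    {
        slice_t *s = (slice_t *)arg;
        double complex acc = 0.0;
        for (long i = 0; i < s->n; i++)
            acc += s->x[i] * s->y[i]; /* unconjugated variant */
        s->out = acc;
        return NULL;
    }

    double complex zdot_threaded(const double complex *x,
                                 const double complex *y,
                                 long n, int nthreads)
    {
        pthread_t tid[NTH_MAX];
        slice_t s[NTH_MAX];
        long chunk = n / nthreads;

        for (int t = 0; t < nthreads; t++) {
            s[t].x = x + t * chunk;
            s[t].y = y + t * chunk;
            s[t].n = (t == nthreads - 1) ? n - t * chunk : chunk;
            pthread_create(&tid[t], NULL, partial_zdot, &s[t]);
        }
        double complex acc = 0.0;
        for (int t = 0; t < nthreads; t++) {
            pthread_join(tid[t], NULL);
            acc += s[t].out;          /* serial reduction of partial sums */
        }
        return acc;
    }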
For #2221 --- kernel/x86_64/zdot.c | 86 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index ef12569c8..48f855b0e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -86,18 +86,26 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + + +static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,OPENBLAS_COMPLEX_FLOAT *result) { BLASLONG i; BLASLONG ix,iy; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; - + if ( n <= 0 ) { -// CREAL(result) = 0.0 ; -// CIMAG(result) = 0.0 ; - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); - return(result); + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); + *result=res; + return; } @@ -150,18 +158,68 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA } #if !defined(CONJ) - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); -// CREAL(result) = dot[0] - dot[1]; -// CIMAG(result) = dot[2] + dot[3]; + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); -// CREAL(result) = dot[0] + dot[1]; -// CIMAG(result) = dot[2] - dot[3]; + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); +#endif + *result=res; + return; +} +#if defined(SMP) +static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, +BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + zdot_compute(n, x, inc_x, y, inc_y, (void *)result); + return 0; +} #endif - return(result); +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + OPENBLAS_COMPLEX_FLOAT zdot; + CREAL(zdot) = 0.0; + CIMAG(zdot) = 0.0; -} +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + zdot_compute(n, x, inc_x, y, inc_y, &zdot); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + OPENBLAS_COMPLEX_FLOAT *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)zdot_thread_function, nthreads); + ptr = (OPENBLAS_COMPLEX_FLOAT *)result; + for (i = 0; i < nthreads; i++) { + CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); + CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); + ptr = (void *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + zdot_compute(n, x, inc_x, y, inc_y, &zdot); +#endif + + return zdot; +} From e3d846ab57eabadf5b933e8ca66d0b2c62e23e4c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Aug 2019 08:58:10 +0200 Subject: [PATCH 679/935] Do not use -march=native with the PGI compiler --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake 
b/cmake/system.cmake index 1c2093efe..4f8011603 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -66,7 +66,7 @@ if (DEFINED TARGET) endif () # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. -if (X86_64) +if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI") set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") endif () From a95a5e52b8df842f0ec23c6d0ad9b299c1318ab4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Aug 2019 09:00:11 +0200 Subject: [PATCH 680/935] Fix PGI compiler detection for getarch --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 6addbdad5..a54282f6c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -142,9 +142,9 @@ endif endif -# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. ifeq ($(ARCH), x86_64) -ifneq ($(C_COMPILER), PGI) +ifeq ($(findstring pgcc,$(HOSTCC)),) GETARCH_FLAGS += -march=native endif endif From 6d8595351c8452f32dc015cda30cf1d4983d2447 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Aug 2019 14:19:21 +0200 Subject: [PATCH 681/935] Add Intel Goldmont Plus CPUID fixes #2227 --- cpuid_x86.c | 29 +- dynamic.c | 897 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 919 insertions(+), 7 deletions(-) create mode 100644 dynamic.c diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..8c954bf21 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,22 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 10: // Goldmont Plus + return CPUTYPE_NEHALEM; + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { diff --git a/dynamic.c b/dynamic.c new file mode 100644 index 000000000..aa2b87621 --- /dev/null +++ b/dynamic.c @@ -0,0 +1,897 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef _MSC_VER +#define strncasecmp _strnicmp +#define strcasecmp _stricmp +#endif + +#ifdef ARCH_X86 +#define EXTERN extern +#else +#define EXTERN +#endif + +#ifdef DYNAMIC_LIST +extern gotoblas_t gotoblas_PRESCOTT; + +#ifdef DYN_ATHLON +extern gotoblas_t gotoblas_ATHLON; +#else +#define gotoblas_ATHLON gotoblas_PRESCOTT +#endif +#ifdef DYN_KATMAI +extern gotoblas_t gotoblas_KATMAI; +#else +#define gotoblas_KATMAI gotoblas_PRESCOTT +#endif +#ifdef DYN_BANIAS +extern gotoblas_t gotoblas_BANIAS; +#else +#define gotoblas_BANIAS gotoblas_PRESCOTT +#endif +#ifdef DYN_COPPERMINE +extern gotoblas_t gotoblas_COPPERMINE; +#else +#define gotoblas_COPPERMINE gotoblas_PRESCOTT +#endif +#ifdef DYN_NORTHWOOD +extern gotoblas_t gotoblas_NORTHWOOD; +#else +#define gotoblas_NORTHWOOD gotoblas_PRESCOTT +#endif +#ifdef DYN_CORE2 +extern gotoblas_t gotoblas_CORE2; +#else +#define gotoblas_CORE2 gotoblas_PRESCOTT +#endif +#ifdef DYN_NEHALEM +extern gotoblas_t gotoblas_NEHALEM; +#else +#define gotoblas_NEHALEM gotoblas_PRESCOTT +#endif +#ifdef DYN_BARCELONA +extern gotoblas_t gotoblas_BARCELONA; +#elif defined(DYN_NEHALEM) +#define gotoblas_BARCELONA gotoblas_NEHALEM +#else +#define gotoblas_BARCELONA gotoblas_PRESCOTT +#endif +#ifdef DYN_ATOM +extern gotoblas_t gotoblas_ATOM; +#elif defined(DYN_NEHALEM) +#define gotoblas_ATOM gotoblas_NEHALEM +#else +#define gotoblas_ATOM gotoblas_PRESCOTT +#endif +#ifdef DYN_NANO +extern gotoblas_t gotoblas_NANO; +#else +#define gotoblas_NANO gotoblas_PRESCOTT +#endif +#ifdef DYN_PENRYN +extern gotoblas_t gotoblas_PENRYN; +#else +#define gotoblas_PENRYN gotoblas_PRESCOTT +#endif +#ifdef DYN_DUNNINGTON +extern gotoblas_t gotoblas_DUNNINGTON; +#else +#define gotoblas_DUNNINGTON gotoblas_PRESCOTT +#endif
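+/* Every DYN_<cpu> block in this DYNAMIC_LIST section follows the same pattern: if that kernel set was compiled in, declare its gotoblas table; otherwise alias the name to the closest compiled-in fallback, degrading stepwise (e.g. HASWELL -> SANDYBRIDGE -> NEHALEM) toward the PRESCOTT baseline. */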
+#ifdef DYN_OPTERON +extern gotoblas_t gotoblas_OPTERON; +#else +#define gotoblas_OPTERON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON_SSE3 +extern gotoblas_t gotoblas_OPTERON_SSE3; +#else +#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT +#endif +#ifdef DYN_BOBCAT +extern gotoblas_t gotoblas_BOBCAT; +#elif defined(DYN_NEHALEM) +#define gotoblas_BOBCAT gotoblas_NEHALEM +#else +#define gotoblas_BOBCAT gotoblas_PRESCOTT +#endif +#ifdef DYN_SANDYBRIDGE +extern gotoblas_t gotoblas_SANDYBRIDGE; +#elif defined(DYN_NEHALEM) +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#else +#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT +#endif +#ifdef DYN_BULLDOZER +extern gotoblas_t gotoblas_BULLDOZER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_BULLDOZER gotoblas_NEHALEM +#else +#define gotoblas_BULLDOZER gotoblas_PRESCOTT +#endif +#ifdef DYN_PILEDRIVER +extern gotoblas_t gotoblas_PILEDRIVER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_PILEDRIVER gotoblas_NEHALEM +#else +#define gotoblas_PILEDRIVER gotoblas_PRESCOTT +#endif +#ifdef DYN_STEAMROLLER +extern gotoblas_t gotoblas_STEAMROLLER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_STEAMROLLER gotoblas_NEHALEM +#else +#define gotoblas_STEAMROLLER gotoblas_PRESCOTT +#endif +#ifdef DYN_EXCAVATOR +extern gotoblas_t gotoblas_EXCAVATOR; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_EXCAVATOR gotoblas_NEHALEM +#else +#define gotoblas_EXCAVATOR gotoblas_PRESCOTT +#endif +#ifdef DYN_HASWELL +extern gotoblas_t gotoblas_HASWELL; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_HASWELL gotoblas_NEHALEM +#else +#define gotoblas_HASWELL gotoblas_PRESCOTT +#endif +#ifdef DYN_ZEN +extern gotoblas_t gotoblas_ZEN; +#elif defined(DYN_HASWELL) +#define gotoblas_ZEN gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_ZEN gotoblas_NEHALEM +#else +#define gotoblas_ZEN gotoblas_PRESCOTT +#endif +#ifdef DYN_SKYLAKEX +extern gotoblas_t gotoblas_SKYLAKEX; +#elif defined(DYN_HASWELL) +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SKYLAKEX gotoblas_NEHALEM +#else +#define gotoblas_SKYLAKEX gotoblas_PRESCOTT +#endif + + +#else // not DYNAMIC_LIST +EXTERN gotoblas_t gotoblas_KATMAI; +EXTERN gotoblas_t gotoblas_COPPERMINE; +EXTERN gotoblas_t gotoblas_NORTHWOOD; +EXTERN gotoblas_t gotoblas_BANIAS; +EXTERN gotoblas_t gotoblas_ATHLON; + +extern gotoblas_t gotoblas_PRESCOTT; +extern gotoblas_t gotoblas_CORE2; +extern gotoblas_t gotoblas_NEHALEM; +extern gotoblas_t gotoblas_BARCELONA; +#ifdef DYNAMIC_OLDER +extern gotoblas_t gotoblas_ATOM; +extern gotoblas_t gotoblas_NANO; +extern gotoblas_t gotoblas_PENRYN; +extern gotoblas_t gotoblas_DUNNINGTON; +extern gotoblas_t gotoblas_OPTERON; +extern gotoblas_t gotoblas_OPTERON_SSE3; +extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_ATOM gotoblas_NEHALEM +#define gotoblas_NANO gotoblas_NEHALEM +#define gotoblas_PENRYN gotoblas_CORE2 +#define gotoblas_DUNNINGTON gotoblas_CORE2 +#define gotoblas_OPTERON gotoblas_CORE2 +#define 
gotoblas_OPTERON_SSE3 gotoblas_CORE2 +#define gotoblas_BOBCAT gotoblas_CORE2 +#endif + +#ifndef NO_AVX +extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BULLDOZER; +extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_STEAMROLLER; +extern gotoblas_t gotoblas_EXCAVATOR; +#ifdef NO_AVX2 +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#else +extern gotoblas_t gotoblas_HASWELL; +extern gotoblas_t gotoblas_ZEN; +#ifndef NO_AVX512 +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#endif +#endif +#else +//Use NEHALEM kernels for sandy bridge +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_HASWELL gotoblas_NEHALEM +#define gotoblas_SKYLAKEX gotoblas_NEHALEM +#define gotoblas_BULLDOZER gotoblas_BARCELONA +#define gotoblas_PILEDRIVER gotoblas_BARCELONA +#define gotoblas_STEAMROLLER gotoblas_BARCELONA +#define gotoblas_EXCAVATOR gotoblas_BARCELONA +#define gotoblas_ZEN gotoblas_BARCELONA +#endif + +#endif // DYNAMIC_LIST + +#define VENDOR_INTEL 1 +#define VENDOR_AMD 2 +#define VENDOR_CENTAUR 3 +#define VENDOR_HYGON 4 +#define VENDOR_UNKNOWN 99 + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#ifndef NO_AVX +static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv + __asm__ __volatile__ + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} +#endif + +int support_avx(){ +#ifndef NO_AVX + int eax, ebx, ecx, edx; + int ret=0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 6) == 6){ + ret=1; //OS support AVX + } + } + return ret; +#else + return 0; +#endif +} + +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 1){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + +extern void openblas_warning(int verbose, const char * msg); +#define FALLBACK_VERBOSE 1 +#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" +#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" +#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. 
OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" + +static int get_vendor(void){ + int eax, ebx, ecx, edx; + + union + { + char vchar[16]; + int vint[4]; + } vendor; + + cpuid(0, &eax, &ebx, &ecx, &edx); + + *(&vendor.vint[0]) = ebx; + *(&vendor.vint[1]) = edx; + *(&vendor.vint[2]) = ecx; + + vendor.vchar[12] = '\0'; + + if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; + if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; + if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; + + if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +static gotoblas_t *get_coretype(void){ + + int eax, ebx, ecx, edx; + int family, exfamily, model, vendor, exmodel; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + family = BITMASK(eax, 8, 0x0f); + exfamily = BITMASK(eax, 20, 0xff); + model = BITMASK(eax, 4, 0x0f); + exmodel = BITMASK(eax, 16, 0x0f); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 0x6: + switch (exmodel) { + case 0: + if (model <= 0x7) return &gotoblas_KATMAI; + if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; + if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; + if (model == 14) return &gotoblas_BANIAS; + if (model == 15) return &gotoblas_CORE2; + return NULL; + + case 1: + if (model == 6) return &gotoblas_CORE2; + if (model == 7) return &gotoblas_PENRYN; + if (model == 13) return &gotoblas_DUNNINGTON; + if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; + if (model == 12) return &gotoblas_ATOM; + return NULL; + + case 2: + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + // Xeon (Clarkdale), 32nm + if (model == 5) return &gotoblas_NEHALEM; + + //Intel Xeon Processor 5600 (Westmere-EP) + //Xeon Processor E7 (Westmere-EX) + //Xeon E7540 + if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; + + //Intel Core i5-2000 /i7-2000 (Sandy Bridge) + //Intel Core i7-3000 / Xeon E5 + if (model == 10 || model == 13) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + case 3: + //Intel Sandy Bridge 22nm (Ivy Bridge?) + if (model == 10 || model == 14) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Haswell + if (model == 12 || model == 15) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Broadwell + if (model == 13) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
+ } + } + if (model == 7) return &gotoblas_ATOM; //Bay Trail + return NULL; + case 4: + //Intel Haswell + if (model == 5 || model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Broadwell + if (model == 7 || model == 15) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Skylake + if (model == 14) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Braswell / Avoton + if (model == 12 || model == 13) { + return &gotoblas_NEHALEM; + } + return NULL; + case 5: + //Intel Broadwell + if (model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + if (model == 5) { + // Intel Skylake X + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + //Intel Skylake + if (model == 14) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Phi Knights Landing + if (model == 7) { + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
+ } + } + //Apollo Lake or Denverton + if (model == 12 || model == 15) { + return &gotoblas_NEHALEM; + } + return NULL; + case 6: + if (model == 6) { + // Cannon Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + case 7: + if (model == 10) // Goldmont plus + return &gotoblas_NEHALEM; + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + case 9: + case 8: + if (model == 14 ) { // Kaby Lake, Coffee Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + } + case 0xf: + if (model <= 0x2) return &gotoblas_NORTHWOOD; + return &gotoblas_PRESCOTT; + } + } + + if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ + if (family <= 0xe) { + // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if ( (eax & 0xffff) >= 0x01) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) + return NULL; + } + else + return NULL; + + return &gotoblas_ATHLON; + } + if (family == 0xf){ + if ((exfamily == 0) || (exfamily == 2)) { + if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; + else return &gotoblas_OPTERON; + } else if (exfamily == 5) { + return &gotoblas_BOBCAT; + } else if (exfamily == 6) { + if(model == 1){ + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return &gotoblas_BULLDOZER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 2 || model == 3){ + //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 + if(support_avx()) + return &gotoblas_PILEDRIVER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 5){ + if(support_avx()) + return &gotoblas_EXCAVATOR; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 0 || model == 8){ + if (exmodel == 1) { + //AMD Trinity + if(support_avx()) + return &gotoblas_PILEDRIVER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if (exmodel == 3) { + //AMD STEAMROLLER + if(support_avx()) + return &gotoblas_STEAMROLLER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
+ } + }else if (exmodel == 6) { + if(support_avx()) + return &gotoblas_EXCAVATOR; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + + } + } + } else if (exfamily == 8) { + if (model == 1 || model == 8) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + } + } else if (exfamily == 9) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else { + return &gotoblas_BARCELONA; + } + } + } + + if (vendor == VENDOR_CENTAUR) { + switch (family) { + case 0x6: + return &gotoblas_NANO; + } + } + + return NULL; +} + +static char *corename[] = { + "Unknown", + "Katmai", + "Coppermine", + "Northwood", + "Prescott", + "Banias", + "Atom", + "Core2", + "Penryn", + "Dunnington", + "Nehalem", + "Athlon", + "Opteron", + "Opteron_SSE3", + "Barcelona", + "Nano", + "Sandybridge", + "Bobcat", + "Bulldozer", + "Piledriver", + "Haswell", + "Steamroller", + "Excavator", + "Zen", + "SkylakeX" +}; + +char *gotoblas_corename(void) { + + if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; + if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; + if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; + if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; + if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; + if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_CORE2) return corename[ 7]; + if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; + if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_NEHALEM) return corename[10]; + if (gotoblas == &gotoblas_ATHLON) return corename[11]; + if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; + if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_BARCELONA) return corename[14]; + if (gotoblas == &gotoblas_NANO) return corename[15]; + if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; + if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; + if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_HASWELL) return corename[20]; + if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; + if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; + if (gotoblas == &gotoblas_ZEN) return corename[23]; + if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; + return corename[0]; +} + + +static gotoblas_t *force_coretype(char *coretype){ + + int i ; + int found = -1; + char message[128]; + //char mname[20]; + + for ( i=1 ; i <= 24; i++) + { + if (!strncasecmp(coretype,corename[i],20)) + { + found = i; + break; + } + } + if (found < 0) + { + //strncpy(mname,coretype,20); + snprintf(message, 128, "Core not found: %s\n",coretype); + openblas_warning(1, message); + return(NULL); + } + + switch (found) + { + case 24: return (&gotoblas_SKYLAKEX); + case 23: return (&gotoblas_ZEN); + case 22: return (&gotoblas_EXCAVATOR); + case 21: return (&gotoblas_STEAMROLLER); + case 20: return (&gotoblas_HASWELL); + case 19: return (&gotoblas_PILEDRIVER); + case 18: return (&gotoblas_BULLDOZER); + case 17: return (&gotoblas_BOBCAT); + case 16: return (&gotoblas_SANDYBRIDGE); + case 15: return (&gotoblas_NANO); + case 
14: return (&gotoblas_BARCELONA); + case 13: return (&gotoblas_OPTERON); + case 12: return (&gotoblas_OPTERON_SSE3); + case 11: return (&gotoblas_ATHLON); + case 10: return (&gotoblas_NEHALEM); + case 9: return (&gotoblas_DUNNINGTON); + case 8: return (&gotoblas_PENRYN); + case 7: return (&gotoblas_CORE2); + case 6: return (&gotoblas_ATOM); + case 5: return (&gotoblas_BANIAS); + case 4: return (&gotoblas_PRESCOTT); + case 3: return (&gotoblas_NORTHWOOD); + case 2: return (&gotoblas_COPPERMINE); + case 1: return (&gotoblas_KATMAI); + } + return(NULL); + +} + + + + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + +#ifdef ARCH_X86 + if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; +#else + if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ + if (sizeof(void*) == 8) { + if (gotoblas == &gotoblas_KATMAI || + gotoblas == &gotoblas_COPPERMINE || + gotoblas == &gotoblas_NORTHWOOD || + gotoblas == &gotoblas_BANIAS || + gotoblas == &gotoblas_ATHLON) + gotoblas = &gotoblas_PRESCOTT; + } +#endif + + if (gotoblas && gotoblas -> init) { + strncpy(coren,gotoblas_corename(),20); + sprintf(coremsg, "Core: %s\n",coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + + gotoblas = NULL; + +} From 3dc6b26eff770dc3a74bb370500f5fb99ce540d1 Mon Sep 17 00:00:00 2001 From: Kavana Bhat Date: Tue, 20 Aug 2019 06:51:35 -0500 Subject: [PATCH 682/935] AIX changes for Power8 --- common_power.h | 29 + kernel/Makefile.L3 | 238 +++- kernel/power/casum_microk_power8.c | 10 +- kernel/power/ccopy_microk_power8.c | 10 +- kernel/power/cgemm_macros_8x4_power8.S | 768 +++++++++++ kernel/power/cgemm_tcopy_macros_8_power8.S | 96 ++ kernel/power/crot.c | 10 +- kernel/power/cswap_microk_power8.c | 6 +- kernel/power/ctrmm_macros_8x4_power8.S | 768 +++++++++++ kernel/power/dasum_microk_power8.c | 12 +- kernel/power/daxpy_microk_power8.c | 10 +- kernel/power/dcopy_microk_power8.c | 10 +- kernel/power/ddot_microk_power8.c | 12 +- kernel/power/dgemm_macros_16x4_power8.S | 976 ++++++++++++++ kernel/power/dgemm_ncopy_macros_4_power8.S | 120 ++ kernel/power/dgemm_tcopy_macros_16_power8.S | 120 ++ kernel/power/dgemv_n_microk_power8.c | 26 +- kernel/power/dgemv_t.c | 36 +- kernel/power/drot_microk_power8.c | 14 +- kernel/power/dscal_microk_power8.c | 18 +- kernel/power/dswap_microk_power8.c | 6 +- kernel/power/dtrmm_macros_16x4_power8.S | 960 ++++++++++++++ kernel/power/dtrsm_macros_LT_16x4_power8.S | 878 +++++++++---- kernel/power/idamax.c | 64 +- kernel/power/idamin.c | 64 +- kernel/power/izamax.c | 136 +- kernel/power/izamin.c | 136 +- kernel/power/lock.c | 4 +- kernel/power/sasum_microk_power8.c | 10 +- kernel/power/scopy_microk_power8.c | 10 +- kernel/power/sdot_microk_power8.c | 10 +- kernel/power/sgemm_macros_16x8_power8.S | 1296 +++++++++++++++++++ kernel/power/sgemm_tcopy_macros_16_power8.S | 120 ++ kernel/power/sgemm_tcopy_macros_8_power8.S | 96 ++ kernel/power/srot_microk_power8.c | 10 +- kernel/power/sscal_microk_power8.c | 16 +- kernel/power/sswap_microk_power8.c | 6 +- kernel/power/strmm_macros_16x8_power8.S | 1280 ++++++++++++++++++ kernel/power/zasum_microk_power8.c | 12 +- 
kernel/power/zaxpy_microk_power8.c | 46 +- kernel/power/zcopy_microk_power8.c | 10 +- kernel/power/zdot_microk_power8.c | 42 +- kernel/power/zgemm_macros_8x2_power8.S | 832 +++++++++--- kernel/power/zgemm_tcopy_macros_8_power8.S | 96 ++ kernel/power/zrot.c | 14 +- kernel/power/zscal_microk_power8.c | 46 +- kernel/power/zswap_microk_power8.c | 6 +- kernel/power/ztrmm_macros_8x2_power8.S | 782 +++++++++-- 48 files changed, 9272 insertions(+), 1005 deletions(-) diff --git a/common_power.h b/common_power.h index 889205c75..76b9f0f32 100644 --- a/common_power.h +++ b/common_power.h @@ -39,6 +39,35 @@ #ifndef COMMON_POWER #define COMMON_POWER +#define str(x) #x + +#ifdef OS_AIX +#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z +#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00 +#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11 +#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10 +#define XVMOVDP(T,A) xvcpsgndp T, A, A + +#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t" +#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t" +#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t" +#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t" + +#else +#define XXSPLTD(T,A,z) xxspltd T, A, z +#define XXMRGHD(T,A,B) xxmrghd T, A, B +#define XXMRGLD(T,A,B) xxmrgld T, A, B +#define XXSWAPD(T,A) xxswapd T, A +#define XVMOVDP(T,A) xvmovdp T, A + +#define XXSPLTD_S(T,A,z) "xxspltd T, A, z \n\t" +#define XXMRGHD_S(T,A,B) "xxmrghd T, A, B \n\t" +#define XXMRGLD_S(T,A,B) "xxmrgld T, A, B \n\t" +#define XXSWAPD_S(T,A) "xxswapd T, A" + +#endif + + #if defined(POWER8) || defined(POWER9) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f83def47b..ed8ae406f 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -57,8 +57,6 @@ USE_TRMM = 1 endif - - SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ @@ -436,7 +434,10 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s + m4 sgemmotcopy.s > sgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ + rm sgemmotcopy.s sgemmotcopy_nomacros.s ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) @@ -444,12 +445,17 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s + m4 sgemmitcopy.s > sgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ + rm sgemmitcopy.s sgemmitcopy_nomacros.s endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s + m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ + rm dgemm_ncopy.s dgemm_ncopy_nomacros.s $(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -460,7 +466,10 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ 
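+# The rewritten rules here all follow one three-step pattern: preprocess with cpp (-E) to a .s file, expand the assembler macros with m4, then assemble the macro-free output (removing the intermediates afterwards).
+# This is what lets the macro-heavy POWER8 kernels build with the AIX assembler, which does not expand these macros itself.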
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s + m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ + rm dgemm_itcopy.s dgemm_itcopy_nomacros.s endif @@ -485,10 +494,16 @@ endif endif $(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) +# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s +# m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +# rm cgemm_oncopy.s cgemm_oncopy_nomacros.s $(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) +# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_otcopy.s +# m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +# rm cgemm_otcopy.s cgemm_otcopy_nomacros.s ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) @@ -496,7 +511,10 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s + m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ + rm cgemm_itcopy.s cgemm_itcopy_nomacros.s endif @@ -512,7 +530,10 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s + m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ + rm zgemm_itcopy.s zgemm_itcopy_nomacros.s endif @@ -537,37 +558,67 @@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s + m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s + m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s + m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ + rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s + m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ + rm 
cgemm_kernel_l.s cgemm_kernel_l_nomacros.s $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ + rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s + m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ + rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s + m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ + rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s + m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ + rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s + m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ + rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s + m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ + rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -584,28 +635,56 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s + m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ + rm strmmkernel_ln.s strmmkernel_ln_nomacros.s $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s + m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ + rm strmmkernel_lt.s strmmkernel_lt_nomacros.s $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s + m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ + rm strmmkernel_rn.s strmmkernel_rn_nomacros.s $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s +# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s + m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ + rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s +# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s + m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ + rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s +# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s + m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ + rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s +# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rt.s + m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ + rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -620,52 +699,100 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s + m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN 
ctrmm_kernel_ln_nomacros.s -o $@ + rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s + m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ + rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s + m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ + rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s + m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ + rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s + m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ + rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s + m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ + rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s + m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ + rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s + m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ + rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) 
$(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s + m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ + rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s + m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ + rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s + m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ + rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s + m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ + rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s + m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ + rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s + m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ + rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s + m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ + rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< 
-o ztrmm_kernel_rc.s + m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ + rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -677,7 +804,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -801,10 +931,16 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) +# $(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s +# m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ +# rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s + m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ + rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ @@ -1940,7 +2076,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) endif -$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) +$(D cgemm_kernel_r_nomacros.s + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ + rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ @@ -2083,7 +2222,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + $(CC) $(PFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
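The converted rules above all instantiate the same three-step pattern: run only the C preprocessor (-E) so the #if/#define logic is resolved into a plain .s file, run m4 over that file so the define(`NAME', `...') macro bodies used for the AIX assembler are expanded, then assemble the macro-free result and delete the two intermediates. A minimal sketch of the pattern, with a hypothetical kernel name standing in for the many real rules:

$(KDIR)foo_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(FOOKERNEL)
	$(CC) $(CFLAGS) -E $< -o foo_kernel.s
	m4 foo_kernel.s > foo_kernel_nomacros.s
	$(CC) $(CFLAGS) -c foo_kernel_nomacros.s -o $@
	rm foo_kernel.s foo_kernel_nomacros.s

Note that the -D/-U defines selecting the kernel variant are consumed by the -E step; repeating them on the -c step, as the rules above do, is harmless for a plain .s input.

diff --git a/kernel/power/casum_microk_power8.c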
b/kernel/power/casum_microk_power8.c index 7d12c9885..91d53ffc3 100644 --- a/kernel/power/casum_microk_power8.c +++ b/kernel/power/casum_microk_power8.c @@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" @@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x) "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c index 613c4d286..6a7886e6f 100644 --- a/kernel/power/ccopy_microk_power8.c +++ b/kernel/power/ccopy_microk_power8.c @@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S index 9a18cb189..46108bbb4 100644 --- a/kernel/power/cgemm_macros_8x4_power8.S +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -83,7 +83,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -107,9 +111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -172,9 +184,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -237,9 +257,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -302,9 +330,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -344,9 +380,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
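The label rewrites running through these Power8 microkernels (numeric local labels 1:/1b/2f becoming one%=:/one%=/two%=) are what let the same inline assembly pass both GNU as and the AIX assembler, which does not accept GNU-style numeric local labels. The %= operator expands to a number unique to each asm instance, so the compiler may duplicate the block (for example when inlining) without label collisions. A minimal self-contained sketch of the technique, assuming a 64-bit POWER target; the function and label names are illustrative, not taken from the patch:

#include <stdio.h>

/* Count from 0 up to n using a named, instance-unique asm label.
 * "one%=" expands to something like "one42", so two inlined copies
 * of this block cannot collide the way a bare "1:" label could.   */
static long count_up (long n)
{
  long i;
  __asm__ (
    "li %0, 0          \n\t"  /* i = 0                        */
    "one%=:            \n\t"  /* loop head, unique per block  */
    "addi %0, %0, 1    \n\t"  /* i += 1                       */
    "cmpd %0, %1       \n\t"  /* compare i with n, sets cr0   */
    "blt one%=         \n"    /* branch back while i < n      */
    : "=&r" (i)               /* early clobber: written before n is last read */
    : "r" (n)
    : "cr0");
  return i;
}

int main (void)
{
  printf ("%ld\n", count_up (5));  /* prints 5 */
  return 0;
}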
xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -409,9 +453,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -474,9 +526,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -1546,14 +1606,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -1575,9 +1643,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1622,9 +1698,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1669,9 +1753,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1716,9 +1808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1742,9 +1842,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1789,9 +1897,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1836,9 +1952,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -2388,14 +2512,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2416,9 +2548,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2454,9 +2594,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2492,9 +2640,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2530,9 +2686,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2548,9 +2712,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2586,9 +2758,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2624,9 +2804,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -2916,14 +3104,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -2945,9 +3141,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -2992,9 +3196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3039,9 +3251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3086,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -3112,9 +3340,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3159,9 +3395,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3206,9 +3450,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -3382,14 +3634,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -3406,9 +3666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3446,9 +3714,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3486,9 +3762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3526,9 +3810,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3550,9 +3842,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3590,9 +3890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3630,9 +3938,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4170,14 +4486,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -4192,9 +4516,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4222,9 +4554,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4252,9 +4592,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4282,9 +4630,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4298,9 +4654,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4328,9 +4692,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4358,9 +4730,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4638,14 +5018,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4659,9 +5047,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4684,9 +5080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4709,9 +5113,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4734,9 +5146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4746,9 +5166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4771,9 +5199,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4796,9 +5232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4946,14 +5390,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -4968,9 +5420,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -4998,9 +5458,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5028,9 +5496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5058,9 +5534,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -5074,9 +5558,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5104,9 +5596,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5134,9 +5634,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5226,14 +5734,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -5247,9 +5763,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5275,9 +5799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5303,9 +5835,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5331,9 +5871,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5346,9 +5894,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5374,9 +5930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5402,9 +5966,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5676,14 +6248,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -5695,9 +6275,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5717,9 +6305,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5739,9 +6335,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5761,9 +6365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5772,9 +6384,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5794,9 +6414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5816,9 +6444,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5960,14 +6596,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5978,9 +6622,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5997,9 +6649,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6016,9 +6676,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6035,18 +6703,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6063,9 +6747,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6082,9 +6774,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -6161,14 +6861,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -6180,9 +6888,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6202,9 +6918,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6224,9 +6948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6246,9 +6978,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -6257,9 +6997,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6279,9 +7027,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6301,9 +7057,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -6351,5 +7115,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S index 03fda2766..64bf8dd99 100644 --- a/kernel/power/cgemm_tcopy_macros_8_power8.S +++ b/kernel/power/cgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs46, o32, T1 stxvw4x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
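Every assembler macro in these .S files gets the same dual wrapping: on AIX the C preprocessor keeps the m4 define(`NAME', `...') form, which the m4 pass added by the Makefile rules then expands, while on every other platform the native gas .macro/.endm pair survives. Call sites need no change at all, because m4 substitutes a bare macro name in the token stream exactly where gas would expand its macro. A minimal sketch with a hypothetical macro; the VSX registers are illustrative:

#if defined(_AIX)
define(`SWAP_VS', `
#else
.macro SWAP_VS
#endif

	xxlor	vs40, vs32, vs32	// save the first vector
	xxlor	vs32, vs33, vs33	// first <- second
	xxlor	vs33, vs40, vs40	// second <- saved copy

#if defined(_AIX)
')
#else
.endm
#endif

	// the use site is identical in both modes:
	SWAP_VS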
stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 959a9eda0..2a5835546 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "addi %[x_ptr], %[x_ptr], 64 \n\t" "addi %[y_ptr], %[y_ptr], 64 \n\t" "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "ble 2f \n\t" - ".p2align 5 \n\t" - "1: \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" "xvmulsp 42, 34, 36 \n\t" @@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "addi %[x_ptr], %[x_ptr], 128 \n\t" "addi %[y_ptr], %[y_ptr], 128 \n\t" "addic. %[temp_n], %[temp_n], -8 \n\t" - "bgt 1b \n\t" - "2: \n\t" + "bgt one%= \n\t" + "two%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" "xvmulsp 42, 34, 36 \n\t" diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c index 8d7d0c0b9..829800230 100644 --- a/kernel/power/cswap_microk_power8.c +++ b/kernel/power/cswap_microk_power8.c @@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y) "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S index 48a21252c..922cab57a 100644 --- a/kernel/power/ctrmm_macros_8x4_power8.S +++ b/kernel/power/ctrmm_macros_8x4_power8.S @@ -83,7 +83,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -113,9 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -184,9 +196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -255,9 +275,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -326,9 +354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -368,9 +404,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -439,9 +483,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
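One more detail of these rewrites is worth spelling out: .p2align is a GNU as extension that the AIX assembler lacks, but on PowerPC targets both GNU as and AIX as treat the argument of .align as a power of two, so the substitution is behavior-preserving. In directive form, a sketch using the alignment value that appears throughout these kernels:

	.p2align 5	# GNU as only: align to 2**5 = 32 bytes
	.align   5	# GNU as and AIX as on PowerPC: likewise 2**5 = 32 bytes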
xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -510,9 +562,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -1597,14 +1657,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1630,9 +1698,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1681,9 +1757,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1732,9 +1816,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1783,9 +1875,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1809,9 +1909,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1860,9 +1968,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1911,9 +2027,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -2470,14 +2594,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2501,9 +2633,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2542,9 +2682,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2583,9 +2731,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2624,9 +2780,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2642,9 +2806,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2683,9 +2855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2724,9 +2904,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3019,14 +3207,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -3055,9 +3251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3109,9 +3313,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3163,9 +3375,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3217,9 +3437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -3243,9 +3471,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3297,9 +3533,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3351,9 +3595,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -3526,14 +3778,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3556,9 +3816,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3602,9 +3870,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3648,9 +3924,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3694,9 +3978,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3718,9 +4010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3764,9 +4064,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3810,9 +4118,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4357,14 +4673,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4383,9 +4707,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4417,9 +4749,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4451,9 +4791,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4485,9 +4833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4501,9 +4857,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4535,9 +4899,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4569,9 +4941,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4852,14 +5232,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4876,9 +5264,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4904,9 +5300,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4932,9 +5336,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4960,9 +5372,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4972,9 +5392,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5000,9 +5428,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5028,9 +5464,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -5179,14 +5623,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -5205,9 +5657,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5239,9 +5699,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5273,9 +5741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5307,9 +5783,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -5323,9 +5807,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5357,9 +5849,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5391,9 +5891,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5482,14 +5990,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5514,9 +6030,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5553,9 +6077,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5592,9 +6124,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5631,9 +6171,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5646,9 +6194,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5685,9 +6241,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5724,9 +6288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -6001,14 +6573,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6029,9 +6609,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6060,9 +6648,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6091,9 +6687,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6122,9 +6726,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6133,9 +6745,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6164,9 +6784,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6195,9 +6823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -6340,14 +6976,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6366,9 +7010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6393,9 +7045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6420,9 +7080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6447,18 +7115,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6483,9 +7167,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6510,9 +7202,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -6589,14 +7289,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -6610,9 +7318,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6634,9 +7350,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6658,9 +7382,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6682,9 +7414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -6693,9 +7433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6717,9 +7465,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6741,9 +7497,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -6790,5 +7554,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c index 880d7d271..4652fc57c 100644 --- a/kernel/power/dasum_microk_power8.c +++ b/kernel/power/dasum_microk_power8.c @@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x) "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c index fb714a3f9..a92026e83 100644 --- a/kernel/power/daxpy_microk_power8.c +++ b/kernel/power/daxpy_microk_power8.c @@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) __asm__ ( - "xxspltd %x4, %x22, 0 \n\t" + XXSPLTD_S(%x4,%x22,0) "dcbt 0, %2 \n\t" "dcbt 0, %3 \n\t" @@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "addi %3, %3, -64 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" ".align 5 \n" - "1: \n\t" + "one%=: \n\t" "xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t" @@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "addi %3, %3, -64 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t" diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c index 261dc04de..b51a21d08 100644 --- a/kernel/power/dcopy_microk_power8.c +++ b/kernel/power/dcopy_microk_power8.c @@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c index 4e6bc29c9..d2518ef7e 100644 --- a/kernel/power/ddot_microk_power8.c +++ b/kernel/power/ddot_microk_power8.c @@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" "lxvd2x 40, 0, %2 \n\t" @@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" "xvmaddadp 33, 41, 49 \n\t" @@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 5be517f7c..782425fbd 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -37,7 +37,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
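Beyond the labels, the scalar kernels above stop emitting the extended VSX mnemonics "xxswapd" and "xxspltd" directly and route them through the XXSWAPD_S/XXSPLTD_S wrappers, since both are extended forms of xxpermdi that not every assembler accepts: xxswapd T,S is xxpermdi T,S,S,2, and xxspltd T,S,0 / xxspltd T,S,1 correspond to xxpermdi T,S,S,0 / xxpermdi T,S,S,3. The wrappers themselves are defined in a shared header elsewhere in this series; a plausible sketch of their shape (assumed, not the verbatim definition):

/* Assumed shape of the wrappers used in the asm templates above: emit the
 * extended mnemonic where the assembler understands it, otherwise fall
 * back to the equivalent base xxpermdi encoding. */
#define STR_(x) #x
#define STR(x)  STR_(x)

#if defined(_AIX)
#define XXSWAPD_S(T,S)  " xxpermdi " STR(T) ", " STR(S) ", " STR(S) ", 2 \n\t"
#else
#define XXSWAPD_S(T,S)  " xxswapd "  STR(T) ", " STR(S) " \n\t"
#endif

Under this reading, XXSWAPD_S(33,32) in dasum/ddot expands to the old "xxswapd 33, 32" under GNU as and to "xxpermdi 33, 32, 32, 2" on AIX, and XXSPLTD_S(%x4,%x22,0) in daxpy maps the same way onto xxpermdi with a 0 doubleword selector.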
* Macros for N=4, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -58,10 +62,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -125,11 +137,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -194,9 +214,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -260,9 +288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_L1', ` +#else .macro KERNEL4x16_L1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -326,9 +362,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_L2', ` +#else .macro KERNEL4x16_L2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -392,10 +436,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -434,9 +486,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -495,9 +555,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -555,9 +623,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif add T2, CO, LDC @@ -680,13 +756,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs39, o112, T4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -703,9 +787,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -744,9 +836,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -784,9 +884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -824,9 +932,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -849,9 +965,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -887,9 +1011,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -925,9 +1057,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -1035,13 +1175,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1054,9 +1202,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1082,9 +1238,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1110,9 +1274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1138,9 +1310,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1155,9 +1335,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1183,9 +1371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1211,9 +1407,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -1289,13 +1493,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvd2x vs0, 0, AO @@ -1307,9 +1519,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvd2x vs8, 0, AO @@ -1330,9 +1550,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvd2x vs8, 0, AO @@ -1353,9 +1581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvd2x vs0, 0, AO @@ -1376,9 +1612,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1389,9 +1633,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -1412,9 +1664,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -1435,9 +1695,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -1497,13 +1765,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsdx vs0, 0, AO @@ -1515,9 +1791,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsdx vs8, 0, AO @@ -1538,9 +1822,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsdx vs8, 0, AO @@ -1561,9 +1853,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsdx vs0, 0, AO @@ -1584,9 +1884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs8, vs28 @@ -1597,9 +1905,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -1620,9 +1936,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -1643,9 +1967,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -1705,13 +2037,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1731,9 +2071,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1772,9 +2120,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1813,9 +2169,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1854,9 +2218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1877,9 +2249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1918,9 +2298,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1959,9 +2347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2055,13 +2451,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2074,9 +2478,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2100,9 +2512,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2126,9 +2546,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2152,9 +2580,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2167,9 +2603,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2193,9 +2637,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2219,9 +2671,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -2277,13 +2737,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2294,9 +2762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2314,9 +2790,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2334,9 +2818,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2354,9 +2846,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2365,9 +2865,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2385,9 +2893,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2405,9 +2921,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -2447,13 +2971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs0, 0, AO @@ -2463,9 +2995,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, 0, AO @@ -2480,9 +3020,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, 0, AO @@ -2497,9 +3045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, 0, AO @@ -2514,18 +3070,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -2540,9 +3112,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -2557,9 +3137,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -2591,13 +3179,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsdx vs0, 0, AO @@ -2607,9 +3203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsdx vs8, 0, AO @@ -2624,9 +3228,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsdx vs8, 0, AO @@ -2641,9 +3253,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsdx vs0, 0, AO @@ -2658,18 +3278,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -2684,9 +3320,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -2701,9 +3345,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -2735,13 +3387,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2760,9 +3420,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2791,9 +3459,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2822,9 +3498,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2853,9 +3537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2867,9 +3559,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2898,9 +3598,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2929,9 +3637,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2980,13 +3696,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2998,9 +3722,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3018,9 +3750,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3038,9 +3778,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3058,9 +3806,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -3068,9 +3824,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3088,9 +3852,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3108,9 +3880,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -3140,13 +3920,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3156,9 +3944,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3172,9 +3968,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3188,9 +3992,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3204,17 +4016,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3228,9 +4056,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3244,9 +4080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -3268,13 +4112,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs0, 0, AO @@ -3283,9 +4135,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, 0, AO @@ -3297,9 +4157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, 0, AO @@ -3311,9 +4179,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, 0, AO @@ -3325,16 +4201,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -3346,9 +4238,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -3360,9 +4260,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -3380,13 +4288,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsdx vs0, 0, AO @@ -3395,9 +4311,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsdx vs8, 0, AO @@ -3409,9 +4333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsdx vs8, 0, AO @@ -3423,9 +4355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsdx vs0, 0, AO @@ -3437,16 +4377,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -3458,9 +4414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -3472,9 +4436,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3492,5 +4464,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S index 8d6744b91..33d02c77d 100644 --- a/kernel/power/dgemm_ncopy_macros_4_power8.S +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o0, A1 @@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index 68e53bcf2..6c5b8ed62 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs35, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs33, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c index ae4fe9009..c2eb3968c 100644 --- a/kernel/power/dgemv_n_microk_power8.c +++ b/kernel/power/dgemv_n_microk_power8.c @@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y ( "lxvd2x 34, 0, %10 \n\t" // x0, x1 "lxvd2x 35, %11, %10 \n\t" // x2, x3 - "xxspltd 32, %x9, 0 \n\t" // alpha, alpha + XXSPLTD_S(32,%x9,0) // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) @@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda - "xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha - "xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha - "xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha - "xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda @@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %6, %6, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 @@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. 
%1, %1, -4 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index b8589a131..ffe469d4d 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "li %[off],32 \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" //-------------------------------------------------- - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" "addi %[off2], %[off2],32 \n\t" @@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" "addi %[off2], %[off2],32 \n\t" @@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" #if defined(PREFETCH) @@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 33, %[x], %[off2] \n\t" "addic. %[n],%[n],-4 \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "addi %[off2], %[off2],32 \n\t" #if defined(PREFETCH) @@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do #if defined(PREFETCH) "dcbt %[temp],%[x] \n\t" #endif - "bgt+ 1b \n\t" - ".p2align 5 \n\t" - "2: \n\t" + "bgt+ one%= \n\t" + ".align 5 \n\t" + "two%=: \n\t" //-------------------------------------------- "xvmaddadp 34,36,32 \n\t" @@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "xvmaddadp 7,46,32 \n\t" "xvmaddadp 8,48,32 \n\t" "xvmaddadp 9,50,32 \n\t" - "xxspltd 36, %x[alpha], 0 \n\t" + XXSPLTD_S(36,%x[alpha],0) "xvmaddadp 34,37,33 \n\t" "xvmaddadp 35,39,33 \n\t" "xvmaddadp 4,41,33 \n\t" @@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do - "xxmrgld 42,34,35 \n\t" - "xxmrghd 43,34,35 \n\t" + XXMRGLD_S(42,34,35) + XXMRGHD_S(43,34,35) - "xxmrgld 44,4,5 \n\t" - "xxmrghd 45,4,5 \n\t" + XXMRGLD_S(44,4,5) + XXMRGHD_S(45,4,5) "xvadddp 42,42,43 \n\t" - "xxmrgld 46,6,7 \n\t" - "xxmrghd 47,6,7 \n\t" + XXMRGLD_S(46,6,7) + XXMRGHD_S(47,6,7) "xvadddp 44,44,45 \n\t" - "xxmrgld 48,8,9 \n\t" - "xxmrghd 49,8,9 \n\t" + XXMRGLD_S(48,8,9) + XXMRGHD_S(49,8,9) "xvadddp 46,46,47 \n\t" diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c index 016b7764d..259c08187 100644 --- a/kernel/power/drot_microk_power8.c +++ b/kernel/power/drot_microk_power8.c @@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) __asm__ ( - "xxspltd 36, %x13, 0 \n\t" // load c to both dwords - "xxspltd 37, %x14, 0 \n\t" // load s to both dwords + XXSPLTD_S(36,%x13,0) // load c to both dwords + XXSPLTD_S(37,%x14,0) // load s to both dwords "lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 33, %15, %3 \n\t" @@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) "addi %4, %4, 64 \n\t" "addic. 
%2, %2, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" @@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) "addi %4, %4, 128 \n\t" "addic. %2, %2, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c index 04898eb3d..e9bacd05a 100644 --- a/kernel/power/dscal_microk_power8.c +++ b/kernel/power/dscal_microk_power8.c @@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha) ( "dcbt 0, %2 \n\t" - "xxspltd %x3, %x3, 0 \n\t" + XXSPLTD_S(%x3,%x3,0) "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %4, %2 \n\t" @@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t" @@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha) "addi %2, %2, 256 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t" @@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x) ( "xxlxor %x3, %x3, %x3 \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t" @@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c index 31eff3449..ecfd5c9f9 100644 --- a/kernel/power/dswap_microk_power8.c +++ b/kernel/power/dswap_microk_power8.c @@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y) "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S index 079144a90..efb034594 100644 --- a/kernel/power/dtrmm_macros_16x4_power8.S +++ b/kernel/power/dtrmm_macros_16x4_power8.S @@ -37,7 +37,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -60,9 +64,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -127,9 +139,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
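
Note: the C micro-kernel hunks above (dgemv_n_microk_power8.c, dgemv_t.c, drot_microk_power8.c, dscal_microk_power8.c) replace raw extended mnemonics in the inline-asm templates, xxspltd/xxmrghd/xxmrgld, with the macros XXSPLTD_S/XXMRGHD_S/XXMRGLD_S. The extended forms are aliases for xxpermdi (xxspltd T,A,0 is xxpermdi T,A,A,0; xxspltd T,A,1 is xxpermdi T,A,A,3; xxmrghd maps to permute control 0 and xxmrgld to 3), and the AIX assembler, like some older ones, may not accept the aliases, so routing every use through one macro lets a single header pick the spelling. A minimal sketch of such a header follows; the macro names mirror the patch, but this definition is an illustration under that assumption, not a copy of the project's actual header:

/* Sketch only: one plausible definition of the spelling-selection
 * macros used in the hunks above.  The real definitions live in a
 * shared OpenBLAS header; this is an illustration, not a copy. */
#define STR_(x) #x
#define STR(x)  STR_(x)

#if defined(_AIX)
/* AIX as: spell out the base xxpermdi instruction.  The splat
 * selector z (0 or 1) maps to permute control 0 or 3. */
#define XXSPLTD_DM_0 "0"
#define XXSPLTD_DM_1 "3"
#define XXSPLTD_S(T,A,z) \
  "xxpermdi " STR(T) ", " STR(A) ", " STR(A) ", " XXSPLTD_DM_##z " \n\t"
#define XXMRGHD_S(T,A,B) "xxpermdi " STR(T) ", " STR(A) ", " STR(B) ", 0 \n\t"
#define XXMRGLD_S(T,A,B) "xxpermdi " STR(T) ", " STR(A) ", " STR(B) ", 3 \n\t"
#else
/* GNU as accepts the extended mnemonics directly. */
#define XXSPLTD_S(T,A,z) "xxspltd  " STR(T) ", " STR(A) ", " STR(z) " \n\t"
#define XXMRGHD_S(T,A,B) "xxmrghd  " STR(T) ", " STR(A) ", " STR(B) " \n\t"
#define XXMRGLD_S(T,A,B) "xxmrgld  " STR(T) ", " STR(A) ", " STR(B) " \n\t"
#endif
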
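Note: the same C hunks also retire two GNU-as-specific constructs inside the asm templates. Numeric local labels ("1:", with branch targets "1b"/"2f") become named labels suffixed with "%=", and ".p2align 5" becomes ".align 5" (on POWER both take a power-of-two exponent, so both request a 32-byte boundary, while .p2align itself is a GNU as directive). "%=" is the GCC/clang extended-asm escape that expands to a number unique to each asm instance, so "one%=:" stays unique even if a kernel body is emitted more than once in a translation unit, and it avoids the local-label syntax that the AIX assembler does not support. A minimal standalone sketch of the pattern (hypothetical function, not taken from the patch):

#include <stddef.h>

/* Zero n doublewords (n >= 1), looping with a named "%=" label the
 * way the patched kernels do.  GCC/clang extended asm, 64-bit POWER. */
static void zero_dwords(unsigned long *x, size_t n)
{
    unsigned long zero = 0;
    __asm__ volatile (
        ".align  5            \n"   /* 2^5 = 32-byte loop alignment   */
        "one%=:               \n\t" /* %= stamps a unique number, so  */
        "std     %2, 0(%0)    \n\t" /* the label cannot clash with    */
        "addi    %0, %0, 8    \n\t" /* another expansion of this asm  */
        "addic.  %1, %1, -1   \n\t" /* decrement and set CR0          */
        "bgt     one%=        \n\t" /* loop while count > 0           */
        : "+b" (x), "+r" (n)        /* "b": valid base register       */
        : "r" (zero)
        : "cc", "memory");
}
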
addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -195,9 +215,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -262,9 +290,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -303,9 +339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -364,9 +408,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -425,9 +477,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO addi T2, T1, 64 @@ -615,13 +675,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -638,9 +706,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -679,9 +755,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -719,9 +803,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -759,9 +851,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -784,9 +884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -822,9 +930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -860,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -970,13 +1094,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -989,9 +1121,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1017,9 +1157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1045,9 +1193,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1073,9 +1229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1090,9 +1254,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1118,9 +1290,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1146,9 +1326,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -1224,13 +1412,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvd2x vs0, 0, AO @@ -1242,9 +1438,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvd2x vs8, 0, AO @@ -1265,9 +1469,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvd2x vs8, 0, AO @@ -1288,9 +1500,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvd2x vs0, 0, AO @@ -1311,9 +1531,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1324,9 +1552,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -1347,9 +1583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -1370,9 +1614,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -1432,13 +1684,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsdx vs0, 0, AO @@ -1450,9 +1710,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsdx vs8, 0, AO @@ -1473,9 +1741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsdx vs8, 0, AO @@ -1496,9 +1772,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsdx vs0, 0, AO @@ -1519,9 +1803,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs8, vs28 @@ -1532,9 +1824,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -1555,9 +1855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -1578,9 +1886,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -1640,13 +1956,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1666,9 +1990,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1707,9 +2039,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1748,9 +2088,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1789,9 +2137,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1812,9 +2168,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1853,9 +2217,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1894,9 +2266,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO addi T2, T1, 64 @@ -1990,13 +2370,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2009,9 +2397,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2035,9 +2431,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2061,9 +2465,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2087,9 +2499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2102,9 +2522,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2128,9 +2556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2154,9 +2590,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -2212,13 +2656,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2229,9 +2681,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2249,9 +2709,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2269,9 +2737,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2289,9 +2765,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2300,9 +2784,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2320,9 +2812,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2340,9 +2840,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -2382,13 +2890,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs0, 0, AO @@ -2398,9 +2914,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, 0, AO @@ -2415,9 +2939,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, 0, AO @@ -2432,9 +2964,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, 0, AO @@ -2449,18 +2989,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -2475,9 +3031,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -2492,9 +3056,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -2526,13 +3098,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsdx vs0, 0, AO @@ -2542,9 +3122,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsdx vs8, 0, AO @@ -2559,9 +3147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsdx vs8, 0, AO @@ -2576,9 +3172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsdx vs0, 0, AO @@ -2593,18 +3197,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -2619,9 +3239,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -2636,9 +3264,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -2670,13 +3306,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2695,9 +3339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2726,9 +3378,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2757,9 +3417,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2788,9 +3456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2802,9 +3478,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2833,9 +3517,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2864,9 +3556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2915,13 +3615,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2933,9 +3641,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2953,9 +3669,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2973,9 +3697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2993,9 +3725,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -3003,9 +3743,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3023,9 +3771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3043,9 +3799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -3075,13 +3839,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3091,9 +3863,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3107,9 +3887,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3123,9 +3911,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3139,17 +3935,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3163,9 +3975,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3179,9 +3999,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -3203,13 +4031,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs0, 0, AO @@ -3218,9 +4054,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, 0, AO @@ -3232,9 +4076,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, 0, AO @@ -3246,9 +4098,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, 0, AO @@ -3260,16 +4120,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -3281,9 +4157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -3295,9 +4179,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -3315,13 +4207,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsdx vs0, 0, AO @@ -3330,9 +4230,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsdx vs8, 0, AO @@ -3344,9 +4252,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsdx vs8, 0, AO @@ -3358,9 +4274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsdx vs0, 0, AO @@ -3372,16 +4296,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -3393,9 +4333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -3407,9 +4355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3427,5 +4383,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dtrsm_macros_LT_16x4_power8.S b/kernel/power/dtrsm_macros_LT_16x4_power8.S index dc47daa3a..5a5c4037c 100644 --- a/kernel/power/dtrsm_macros_LT_16x4_power8.S +++ b/kernel/power/dtrsm_macros_LT_16x4_power8.S @@ -1,46 +1,58 @@ +#if defined(_AIX) +define(`INIT_16x4', ` +#else .macro INIT_16x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 - xvmovdp vs48, vs0 - xvmovdp vs49, vs0 - xvmovdp vs50, vs0 - xvmovdp vs51, vs0 - xvmovdp vs52, vs0 - xvmovdp vs53, vs0 - xvmovdp vs54, vs0 - xvmovdp vs55, vs0 - xvmovdp vs56, vs0 - xvmovdp vs57, vs0 - xvmovdp vs58, vs0 - xvmovdp vs59, vs0 - xvmovdp vs60, vs0 - xvmovdp vs61, vs0 - xvmovdp vs62, vs0 - xvmovdp vs63, vs0 - + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) + XVMOVDP(vs48,vs0) + XVMOVDP(vs49,vs0) + XVMOVDP(vs50,vs0) + XVMOVDP(vs51,vs0) + XVMOVDP(vs52,vs0) + XVMOVDP(vs53,vs0) + XVMOVDP(vs54,vs0) + XVMOVDP(vs55,vs0) + XVMOVDP(vs56,vs0) + XVMOVDP(vs57,vs0) + XVMOVDP(vs58,vs0) + XVMOVDP(vs59,vs0) + XVMOVDP(vs60,vs0) + XVMOVDP(vs61,vs0) + XVMOVDP(vs62,vs0) + XVMOVDP(vs63,vs0) + +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x4', ` +#else .macro KERNEL_16x4 +#endif lxvd2x vs0, o0, AO @@ -98,35 +110,51 @@ xvmaddadp vs63, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x4', ` +#else .macro INIT_8x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 - + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + 
XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) + +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x4', ` +#else .macro KERNEL_8x4 +#endif lxvd2x vs0, o0, AO @@ -161,27 +189,43 @@ xvmaddadp vs47, vs3, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x4', ` +#else .macro INIT_4x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_4x4', ` +#else .macro KERNEL_4x4 +#endif lxvd2x vs0, o0, AO @@ -206,23 +250,39 @@ xvmaddadp vs39, vs1, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_2x4', ` +#else .macro INIT_2x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x4', ` +#else .macro KERNEL_2x4 +#endif lxvd2x vs0, o0, AO @@ -242,23 +302,39 @@ xvmaddadp vs35, vs0, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x4', ` +#else .macro INIT_1x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x4', ` +#else .macro KERNEL_1x4 +#endif lxvdsx vs0, o0, AO @@ -278,14 +354,22 @@ xvmaddadp vs35, vs0, vs19 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 16x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x4', ` +#else .macro SOLVE_LT_16x4 +#endif //############### LOAD B ####################### @@ -1149,46 +1233,46 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs42, o8, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs44, o16, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs46, o24, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) addi T1, T1, 32 stxsdx vs48, o0, T1 - xxswapd vs48, vs48 + XXSWAPD(vs48,vs48) stxsdx vs50, o8, T1 - xxswapd vs50, vs50 + XXSWAPD(vs50,vs50) stxsdx vs52, o16, T1 - xxswapd vs52, vs52 + XXSWAPD(vs52,vs52) stxsdx vs54, o24, T1 - xxswapd vs54, vs54 + XXSWAPD(vs54,vs54) addi T1, T1, 32 stxsdx vs56, o0, T1 - xxswapd vs56, vs56 + XXSWAPD(vs56,vs56) stxsdx vs58, o8, T1 - xxswapd vs58, vs58 + XXSWAPD(vs58,vs58) stxsdx vs60, o16, T1 - xxswapd vs60, vs60 + XXSWAPD(vs60,vs60) stxsdx vs62, o24, T1 - xxswapd vs62, vs62 + XXSWAPD(vs62,vs62) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1225,46 +1309,46 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, 
o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs41, o0, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs43, o8, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) stxsdx vs45, o16, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) addi T1, T1, 32 stxsdx vs49, o0, T1 - xxswapd vs49, vs49 + XXSWAPD(vs49,vs49) stxsdx vs51, o8, T1 - xxswapd vs51, vs51 + XXSWAPD(vs51,vs51) stxsdx vs53, o16, T1 - xxswapd vs53, vs53 + XXSWAPD(vs53,vs53) stxsdx vs55, o24, T1 - xxswapd vs55, vs55 + XXSWAPD(vs55,vs55) addi T1, T1, 32 stxsdx vs57, o0, T1 - xxswapd vs57, vs57 + XXSWAPD(vs57,vs57) stxsdx vs59, o8, T1 - xxswapd vs59, vs59 + XXSWAPD(vs59,vs59) stxsdx vs61, o16, T1 - xxswapd vs61, vs61 + XXSWAPD(vs61,vs61) stxsdx vs63, o24, T1 - xxswapd vs63, vs63 + XXSWAPD(vs63,vs63) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 @@ -1292,14 +1376,22 @@ stxsdx vs61, o16, T2 stxsdx vs63, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x4', ` +#else .macro SOLVE_LT_8x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1603,24 +1695,24 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs42, o8, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs44, o16, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs46, o24, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1643,24 +1735,24 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs41, o0, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs43, o8, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) stxsdx vs45, o16, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 @@ -1674,14 +1766,22 @@ stxsdx vs45, o16, T2 stxsdx vs47, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x4', ` +#else .macro SOLVE_LT_4x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1813,13 +1913,13 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1835,27 +1935,35 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) stxsdx vs33, o0, T2 stxsdx 
vs35, o8, T2 stxsdx vs37, o16, T2 stxsdx vs39, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_2x4', ` +#else .macro SOLVE_LT_2x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1925,9 +2033,9 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1941,21 +2049,29 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 1x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x4', ` +#else .macro SOLVE_LT_1x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -2001,7 +2117,7 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs32, o0, T2 @@ -2014,39 +2130,55 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs33, o0, T2 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_16x2', ` +#else .macro INIT_16x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 - + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) + +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x2', ` +#else .macro KERNEL_16x2 +#endif lxvd2x vs0, o0, AO @@ -2086,27 +2218,43 @@ xvmaddadp vs47, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x2', ` +#else .macro INIT_8x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x2', ` +#else .macro KERNEL_8x2 +#endif lxvd2x vs0, o0, AO @@ -2131,23 +2279,39 @@ xvmaddadp vs39, vs3, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x2', ` +#else .macro INIT_4x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_4x2', ` +#else .macro KERNEL_4x2 +#endif lxvd2x vs0, o0, AO @@ -2166,21 +2330,37 @@ xvmaddadp vs35, vs1, vs17 +#if defined(_AIX) +') +#else .endm +#endif 
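A minimal sketch of the transformation these hunks apply, stated once here rather than per macro: on AIX these sources are apparently fed through m4, which has no .macro/.endm, so every assembler macro gains the same dual definition. FOO below is a hypothetical name, not one of the kernels' macros:

#if defined(_AIX)
define(`FOO', `
#else
.macro FOO
#endif
        /* macro body unchanged: vector loads, fused multiply-adds, stores */
#if defined(_AIX)
')
#else
.endm
#endif

Call sites stay identical either way, since m4 expands FOO textually on AIX while GNU as expands the .macro everywhere else.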
+#if defined(_AIX) +define(`INIT_2x2', ` +#else .macro INIT_2x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x2', ` +#else .macro KERNEL_2x2 +#endif lxvd2x vs0, o0, AO @@ -2196,21 +2376,37 @@ xvmaddadp vs33, vs0, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x2', ` +#else .macro INIT_1x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x2', ` +#else .macro KERNEL_1x2 +#endif lxvdsx vs0, o0, AO @@ -2226,14 +2422,22 @@ xvmaddadp vs33, vs0, vs17 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 16x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x2', ` +#else .macro SOLVE_LT_16x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -2821,46 +3025,46 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) addi T1, T1, 32 stxsdx vs36, o0, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs37, o8, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs38, o16, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs41, o8, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs42, o16, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs43, o24, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) addi T1, T1, 32 stxsdx vs44, o0, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs45, o8, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs46, o16, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 @@ -2888,14 +3092,22 @@ stxsdx vs46, o16, T2 stxsdx vs47, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x2', ` +#else .macro SOLVE_LT_8x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3111,24 +3323,24 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) addi T1, T1, 32 stxsdx vs36, o0, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs37, o8, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs38, o16, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 @@ -3142,14 +3354,22 @@ stxsdx vs38, o16, T2 stxsdx vs39, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x2 
##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x2', ` +#else .macro SOLVE_LT_4x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3245,27 +3465,35 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 stxsdx vs34, o16, T2 stxsdx vs35, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_2x2', ` +#else .macro SOLVE_LT_2x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3322,21 +3550,29 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 1x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x2', ` +#else .macro SOLVE_LT_1x2 +#endif xxpermdi vs0, vs32, vs33, 0 @@ -3376,39 +3612,55 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs32, o0, T2 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_16x1', ` +#else .macro INIT_16x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 - + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) + +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x1', ` +#else .macro KERNEL_16x1 +#endif lxvdsx vs0, o0, AO @@ -3461,27 +3713,43 @@ xvmaddadp vs47, vs15, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x1', ` +#else .macro INIT_8x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x1', ` +#else .macro KERNEL_8x1 +#endif lxvdsx vs0, o0, AO @@ -3512,23 +3780,39 @@ xvmaddadp vs39, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x1', ` +#else .macro INIT_4x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') 
+#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_4x1', ` +#else .macro KERNEL_4x1 +#endif lxvdsx vs0, o0, AO @@ -3548,21 +3832,37 @@ xvmaddadp vs35, vs3, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_2x1', ` +#else .macro INIT_2x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x1', ` +#else .macro KERNEL_2x1 +#endif lxvdsx vs0, o0, AO @@ -3578,20 +3878,36 @@ xvmaddadp vs33, vs1, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x1', ` +#else .macro INIT_1x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 + XVMOVDP(vs32,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x1', ` +#else .macro KERNEL_1x1 +#endif lxvdsx vs0, o0, AO @@ -3605,31 +3921,39 @@ xvmaddadp vs32, vs0, vs16 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 16x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x1', ` +#else .macro SOLVE_LT_16x1 - - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 - xxswapd vs4, vs36 - xxswapd vs5, vs37 - xxswapd vs6, vs38 - xxswapd vs7, vs39 - xxswapd vs8, vs40 - xxswapd vs9, vs41 - xxswapd vs10, vs42 - xxswapd vs11, vs43 - xxswapd vs12, vs44 - xxswapd vs13, vs45 - xxswapd vs14, vs46 - xxswapd vs15, vs47 +#endif + + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) + XXSWAPD(vs4,vs36) + XXSWAPD(vs5,vs37) + XXSWAPD(vs6,vs38) + XXSWAPD(vs7,vs39) + XXSWAPD(vs8,vs40) + XXSWAPD(vs9,vs41) + XXSWAPD(vs10,vs42) + XXSWAPD(vs11,vs43) + XXSWAPD(vs12,vs44) + XXSWAPD(vs13,vs45) + XXSWAPD(vs14,vs46) + XXSWAPD(vs15,vs47) //############### LOAD B ####################### @@ -4215,23 +4539,31 @@ stxsdx vs46, o16, T1 stxsdx vs47, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x1', ` +#else .macro SOLVE_LT_8x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 - xxswapd vs4, vs36 - xxswapd vs5, vs37 - xxswapd vs6, vs38 - xxswapd vs7, vs39 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) + XXSWAPD(vs4,vs36) + XXSWAPD(vs5,vs37) + XXSWAPD(vs6,vs38) + XXSWAPD(vs7,vs39) //############### LOAD B ####################### @@ -4443,19 +4775,27 @@ stxsdx vs38, o16, T1 stxsdx vs39, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x1', ` +#else .macro SOLVE_LT_4x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) //############### LOAD B ####################### @@ -4546,17 +4886,25 @@ stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x1 
##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_2x1', ` +#else .macro SOLVE_LT_2x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) //############### LOAD B ####################### @@ -4609,16 +4957,24 @@ stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 1x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x1', ` +#else .macro SOLVE_LT_1x1 +#endif - xxswapd vs0, vs32 + XXSWAPD(vs0,vs32) //############### LOAD B ####################### @@ -4655,5 +5011,9 @@ stxsdx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5bdc0a13c..623ac9fb0 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "xxlxor 39,39,39 \n\t" // vs39 vec_max_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) "xvabsdp 44, 44 \n\t" "xvabsdp 45, 45 \n\t" @@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" //=================================================================== - ".p2align 5 \n\t" + ".align 5 \n\t" - "1: \n\t" + "one%=: \n\t" "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 1,1,5 \n\t" // get real index for first bigger - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) "xvcmpgtdp 2, 3,39 \n\t" @@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //<-----------jump here from first load - "2: \n\t" + "two%=: \n\t" "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -173,7 +173,7 @@ 
static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 1,1,5 \n\t" // get real index for first bigger - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" @@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. %[n], %[n], -32 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" //============================================================================== "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { ///////extract max value and max index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4, 40,39 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 7fe0f8a33..b2705f2fa 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8, %[adder] \n\t" //{3,2} vs41 @@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "lxvdsx 
39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) "xvabsdp 39, 39 \n\t" "xvabsdp 44, 44 \n\t" @@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" //=================================================================== - ".p2align 5 \n\t" + ".align 5 \n\t" - "1: \n\t" + "one%=: \n\t" "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 1,1,5 \n\t" // get real index for first smaller - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) "xvcmpgtdp 2,39, 3 \n\t" @@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //<-----------jump here from first load - "2: \n\t" + "two%=: \n\t" "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 1,1,5 \n\t" // get real index for first smaller - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" @@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. 
%[n], %[n], -32 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" //============================================================================== "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { ///////extract min value and min index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index cfe78c8c0..339c3ccde 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -56,8 +56,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -67,7 +67,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) @@ -77,24 +77,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + 
XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -103,15 +103,15 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -133,8 +133,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -148,35 +148,35 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //>>/////////////////////////////// half start - "2: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + "two%=: \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -198,8 +198,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -211,24 +211,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. 
%[n], %[n], -16 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -237,13 +237,13 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "xvcmpgtdp 2,1,0 \n\t " "xxsel 32,32,33,2 \n\t" @@ -262,28 +262,28 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { ///////extract max value and max index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4, 40,39 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -292,7 +292,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 1ffa3ba8b..6d0d15547 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) @@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first 
half forward - "b 2f \n\t" + "b two%= \n\t" - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //>>/////////////////////////////// half start - "2: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + "two%=: \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. 
%[n], %[n], -16 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "xvcmpgtdp 2,0,1 \n\t " "xxsel 32,32,33,2 \n\t" @@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { ///////extract min value and min index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/lock.c b/kernel/power/lock.c index 51348d63c..1c1b006b0 100644 --- a/kernel/power/lock.c +++ b/kernel/power/lock.c @@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ " .machine \"any\" ;" "0: lwarx %0,0, %1 ;" " cmpwi 0,%0,0;" - " bne 1f;" + " bne one%=;" " stwcx. %2,0, %1 ;" " bne- 0b;" - "1: " + "one%=: " : "=&r"(ret) : "r"(address), "r" (val) : "cr0", "memory"); diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index 4bb515de8..aa465c38e 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x) "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" @@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x) "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 7a54d5e1e..da39789b1 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" @@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index bfe100c8b..a8db6a8d6 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddasp 32, 40, 48 \n\t" "lxvd2x 40, 0, %2 \n\t" @@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 33, 41, 49 \n\t" diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index 98414857f..9bcfca827 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x16_1', ` +#else .macro LOAD8x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -63,9 +67,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_I1', ` +#else .macro KERNEL8x16_I1 +#endif lxvw4x vs4, o0, AO @@ -133,9 +145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_1', ` +#else .macro KERNEL8x16_1 +#endif lxvw4x vs4, o0, AO @@ -203,9 +223,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_2', ` +#else .macro KERNEL8x16_2 +#endif lxvw4x vs0, o0, AO @@ -273,9 +301,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_E2', ` +#else .macro KERNEL8x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -319,9 +355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUBI1', ` +#else .macro KERNEL8x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUB1', ` +#else .macro KERNEL8x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -459,9 +511,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x16', ` +#else .macro SAVE8x16 +#endif mr T1, CO @@ -698,14 +758,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x8_1', ` +#else .macro LOAD8x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -728,9 +796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_I1', ` +#else .macro KERNEL8x8_I1 +#endif lxvw4x vs4, o0, AO @@ -780,9 +856,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_1', ` +#else .macro KERNEL8x8_1 +#endif lxvw4x vs4, o0, AO @@ -832,9 +916,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_2', ` +#else .macro KERNEL8x8_2 +#endif lxvw4x vs0, o0, AO @@ -884,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_E2', ` +#else .macro KERNEL8x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -914,9 +1014,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUBI1', ` +#else .macro KERNEL8x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -966,9 +1074,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUB1', ` +#else .macro KERNEL8x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1018,9 +1134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x8', ` +#else .macro SAVE8x8 +#endif mr T1, CO @@ -1193,14 +1317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x4_1', ` +#else .macro LOAD8x4_1 +#endif lxvw4x vs0, o0, AO @@ -1222,9 +1354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_I1', ` +#else .macro KERNEL8x4_I1 +#endif lxvw4x vs4, o0, AO @@ -1265,9 +1405,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_1', ` +#else .macro KERNEL8x4_1 +#endif lxvw4x vs4, o0, AO @@ -1308,9 +1456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_2', ` +#else .macro KERNEL8x4_2 +#endif lxvw4x vs0, o0, AO @@ -1351,9 +1507,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_E2', ` +#else .macro KERNEL8x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -1373,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUBI1', ` +#else .macro KERNEL8x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -1416,9 +1588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUB1', ` +#else .macro KERNEL8x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1459,9 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x4', ` +#else .macro SAVE8x4 +#endif mr T1, CO @@ -1602,14 +1790,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x2_1', ` +#else .macro LOAD8x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -1633,9 +1829,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_I1', ` +#else .macro KERNEL8x2_I1 +#endif lxsspx vs4, o0, AO @@ -1686,9 +1890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_1', ` +#else .macro KERNEL8x2_1 +#endif lxsspx vs4, o0, AO @@ -1739,9 +1951,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_2', ` +#else .macro KERNEL8x2_2 +#endif lxsspx vs0, o0, AO @@ -1792,9 +2012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_E2', ` +#else .macro KERNEL8x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -1822,9 +2050,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUBI1', ` +#else .macro KERNEL8x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -1875,9 +2111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUB1', ` +#else .macro KERNEL8x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -1928,9 +2172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x2', ` +#else .macro SAVE8x2 +#endif mr T1, CO @@ -2103,14 +2355,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x1_1', ` +#else .macro LOAD8x1_1 +#endif lxsspx vs0, o0, AO @@ -2133,9 +2393,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_I1', ` +#else .macro KERNEL8x1_I1 +#endif lxsspx vs4, o0, AO @@ -2177,9 +2445,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_1', ` +#else .macro KERNEL8x1_1 +#endif lxsspx vs4, o0, AO @@ -2221,9 +2497,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_2', ` +#else .macro KERNEL8x1_2 +#endif lxsspx vs0, o0, AO @@ -2265,9 +2549,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_E2', ` +#else .macro KERNEL8x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -2287,9 +2579,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUBI1', ` +#else .macro KERNEL8x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -2331,9 +2631,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUB1', ` +#else .macro KERNEL8x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -2375,9 +2683,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x1', ` +#else .macro SAVE8x1 +#endif mr T1, CO @@ -2518,14 +2834,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2543,9 +2867,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif lxvw4x vs4, o0, AO @@ -2586,9 +2918,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif lxvw4x vs4, o0, AO @@ -2629,9 +2969,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif lxvw4x vs0, o0, AO @@ -2672,9 +3020,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -2698,9 +3054,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -2741,9 +3105,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -2784,9 +3156,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO @@ -2907,14 +3287,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2930,9 +3318,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO @@ -2963,9 +3359,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO @@ -2996,9 +3400,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO @@ -3029,9 +3441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3047,9 +3467,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3080,9 +3508,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3113,9 +3549,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -3204,14 +3648,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO @@ -3226,9 +3678,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO @@ -3254,9 +3714,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO @@ -3282,9 +3750,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO @@ -3310,9 +3786,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3324,9 +3808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3352,9 +3844,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3380,9 +3880,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -3455,14 +3963,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -3479,9 +3995,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxsspx vs4, o0, AO @@ -3513,9 +4037,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxsspx vs4, o0, AO @@ -3547,9 +4079,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxsspx vs0, o0, AO @@ -3581,9 +4121,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3599,9 +4147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3633,9 +4189,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -3667,9 +4231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3758,14 +4330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO @@ -3781,9 +4361,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO @@ -3810,9 +4398,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO @@ -3839,9 +4435,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO @@ -3868,9 +4472,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3882,9 +4494,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3911,9 +4531,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -3940,9 +4568,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -4015,14 +4651,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4038,9 +4682,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvw4x vs4, o0, AO @@ -4069,9 +4721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvw4x vs4, o0, AO @@ -4100,9 +4760,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvw4x vs0, o0, AO @@ -4131,9 +4799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4147,9 +4823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4178,9 +4862,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4209,9 +4901,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO @@ -4274,14 +4974,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4295,9 +5003,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO @@ -4320,9 +5036,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO @@ -4345,9 +5069,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO @@ -4370,9 +5102,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4382,9 +5122,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4407,9 +5155,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4432,9 +5188,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4481,14 +5245,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO @@ -4501,9 +5273,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO @@ -4523,9 +5303,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO @@ -4545,9 +5333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO @@ -4567,9 +5363,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4577,9 +5381,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4599,9 +5411,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4621,9 +5441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4662,14 +5490,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -4684,9 +5520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxsspx vs4, o0, AO @@ -4710,9 +5554,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxsspx vs4, o0, AO @@ -4736,9 +5588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxsspx vs0, o0, AO @@ -4762,9 +5622,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4774,9 +5642,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4800,9 +5676,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -4826,9 +5710,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -4875,14 +5767,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO @@ -4896,9 +5796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO @@ -4919,9 +5827,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO @@ -4942,9 +5858,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO @@ -4965,9 +5889,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4975,9 +5907,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4998,9 +5938,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5021,9 +5969,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5062,14 +6018,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5084,9 +6048,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvw4x vs4, o0, AO @@ -5109,9 +6081,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvw4x vs4, o0, AO @@ -5134,9 +6114,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvw4x vs0, o0, AO @@ -5159,9 +6147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -5170,9 +6166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5195,9 +6199,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5220,9 +6232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO @@ -5256,14 +6276,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5276,9 +6304,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO @@ -5297,9 +6333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO @@ -5318,9 +6362,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO @@ -5339,18 +6391,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5369,9 +6437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5390,9 +6466,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -5418,14 +6502,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO @@ -5437,9 +6529,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO @@ -5456,9 +6556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO @@ -5475,9 +6583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO @@ -5494,17 +6610,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5521,9 +6653,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5540,9 +6680,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -5564,14 +6712,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -5585,9 +6741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxsspx vs4, o0, AO @@ -5607,9 +6771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxsspx vs4, o0, AO @@ -5629,9 +6801,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxsspx vs0, o0, AO @@ -5651,18 +6831,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5682,9 +6878,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -5704,9 +6908,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -5732,14 +6944,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO @@ -5752,9 +6972,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO @@ -5772,9 +7000,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO @@ -5792,9 +7028,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO @@ -5812,17 +7056,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5840,9 +7100,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5860,9 +7128,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -5884,13 +7160,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`COPYB_4x8', ` +#else .macro COPYB_4x8 +#endif lxvw4x vs5, o0, BO @@ -5993,10 +7277,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs54, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`COPYB_1x8', ` +#else .macro COPYB_1x8 +#endif lxvw4x vs5, o0, BO @@ -6026,5 +7318,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs14, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S index 53f9c8b82..ed592a604 100644 --- a/kernel/power/sgemm_tcopy_macros_16_power8.S +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs46, o32, T1 stxvw4x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 @@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 @@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 @@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 @@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 @@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 @@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S index 1b71d5bb3..f80f095dc 100644 --- a/kernel/power/sgemm_tcopy_macros_8_power8.S +++ b/kernel/power/sgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 @@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 @@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 @@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 @@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 @@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 @@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 6eecb60a1..329a8cd06 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "addi %4, %4, 64 \n\t" "addic. %2, %2, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" @@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "addi %4, %4, 128 \n\t" "addic. %2, %2, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 058ff3399..88fba3166 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" @@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "addi %2, %2, 256 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" @@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x) ( "xxlxor %x3, %x3, %x3 \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t" @@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index cfefdd6ef..a407018a8 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y) "addi %4, %4, 128 \n\t" "addic. 
%2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S index 27bc1e89c..6c016d6fa 100644 --- a/kernel/power/strmm_macros_16x8_power8.S +++ b/kernel/power/strmm_macros_16x8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x16_1', ` +#else .macro LOAD8x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -63,9 +67,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_I1', ` +#else .macro KERNEL8x16_I1 +#endif lxvw4x vs4, o0, AO @@ -133,9 +145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_1', ` +#else .macro KERNEL8x16_1 +#endif lxvw4x vs4, o0, AO @@ -203,9 +223,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_2', ` +#else .macro KERNEL8x16_2 +#endif lxvw4x vs0, o0, AO @@ -273,9 +301,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_E2', ` +#else .macro KERNEL8x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -319,9 +355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUBI1', ` +#else .macro KERNEL8x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUB1', ` +#else .macro KERNEL8x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -459,9 +511,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x16', ` +#else .macro SAVE8x16 +#endif mr T1, CO @@ -698,14 +758,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x8_1', ` +#else .macro LOAD8x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -728,9 +796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_I1', ` +#else .macro KERNEL8x8_I1 +#endif lxvw4x vs4, o0, AO @@ -780,9 +856,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_1', ` +#else .macro KERNEL8x8_1 +#endif lxvw4x vs4, o0, AO @@ -832,9 +916,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_2', ` +#else .macro KERNEL8x8_2 +#endif lxvw4x vs0, o0, AO @@ -884,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_E2', ` +#else .macro KERNEL8x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -914,9 +1014,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUBI1', ` +#else .macro KERNEL8x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -966,9 +1074,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUB1', ` +#else .macro KERNEL8x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1018,9 +1134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x8', ` +#else .macro SAVE8x8 +#endif mr T1, CO @@ -1193,14 +1317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x4_1', ` +#else .macro LOAD8x4_1 +#endif lxvw4x vs0, o0, AO @@ -1222,9 +1354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_I1', ` +#else .macro KERNEL8x4_I1 +#endif lxvw4x vs4, o0, AO @@ -1265,9 +1405,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_1', ` +#else .macro KERNEL8x4_1 +#endif lxvw4x vs4, o0, AO @@ -1308,9 +1456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_2', ` +#else .macro KERNEL8x4_2 +#endif lxvw4x vs0, o0, AO @@ -1351,9 +1507,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_E2', ` +#else .macro KERNEL8x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -1373,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUBI1', ` +#else .macro KERNEL8x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -1416,9 +1588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUB1', ` +#else .macro KERNEL8x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1459,9 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x4', ` +#else .macro SAVE8x4 +#endif mr T1, CO @@ -1602,14 +1790,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x2_1', ` +#else .macro LOAD8x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -1632,9 +1828,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_I1', ` +#else .macro KERNEL8x2_I1 +#endif lxsspx vs4, o0, AO @@ -1684,9 +1888,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_1', ` +#else .macro KERNEL8x2_1 +#endif lxsspx vs4, o0, AO @@ -1736,9 +1948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_2', ` +#else .macro KERNEL8x2_2 +#endif lxsspx vs0, o0, AO @@ -1788,9 +2008,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_E2', ` +#else .macro KERNEL8x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -1818,9 +2046,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUBI1', ` +#else .macro KERNEL8x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -1870,9 +2106,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUB1', ` +#else .macro KERNEL8x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -1922,9 +2166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x2', ` +#else .macro SAVE8x2 +#endif mr T1, CO @@ -2097,14 +2349,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x1_1', ` +#else .macro LOAD8x1_1 +#endif lxsspx vs0, o0, AO @@ -2126,9 +2386,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_I1', ` +#else .macro KERNEL8x1_I1 +#endif lxsspx vs4, o0, AO @@ -2169,9 +2437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_1', ` +#else .macro KERNEL8x1_1 +#endif lxsspx vs4, o0, AO @@ -2212,9 +2488,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_2', ` +#else .macro KERNEL8x1_2 +#endif lxsspx vs0, o0, AO @@ -2255,9 +2539,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_E2', ` +#else .macro KERNEL8x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -2277,9 +2569,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUBI1', ` +#else .macro KERNEL8x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -2320,9 +2620,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUB1', ` +#else .macro KERNEL8x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -2363,9 +2671,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x1', ` +#else .macro SAVE8x1 +#endif mr T1, CO @@ -2506,14 +2822,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2531,9 +2855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif lxvw4x vs4, o0, AO @@ -2574,9 +2906,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif lxvw4x vs4, o0, AO @@ -2617,9 +2957,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif lxvw4x vs0, o0, AO @@ -2660,9 +3008,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -2686,9 +3042,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -2729,9 +3093,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -2772,9 +3144,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO @@ -2895,14 +3275,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2918,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO @@ -2951,9 +3347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO @@ -2984,9 +3388,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO @@ -3017,9 +3429,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3035,9 +3455,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3068,9 +3496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3101,9 +3537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -3192,14 +3636,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO @@ -3214,9 +3666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO @@ -3242,9 +3702,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO @@ -3270,9 +3738,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO @@ -3298,9 +3774,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3312,9 +3796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3340,9 +3832,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3368,9 +3868,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -3443,14 +3951,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -3466,9 +3982,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxsspx vs4, o0, AO @@ -3499,9 +4023,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxsspx vs4, o0, AO @@ -3532,9 +4064,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxsspx vs0, o0, AO @@ -3565,9 +4105,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3583,9 +4131,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3616,9 +4172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -3649,9 +4213,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3740,14 +4312,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO @@ -3762,9 +4342,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO @@ -3790,9 +4378,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO @@ -3818,9 +4414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO @@ -3846,9 +4450,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3860,9 +4472,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3888,9 +4508,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -3916,9 +4544,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -3991,14 +4627,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4014,9 +4658,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvw4x vs4, o0, AO @@ -4045,9 +4697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvw4x vs4, o0, AO @@ -4076,9 +4736,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvw4x vs0, o0, AO @@ -4107,9 +4775,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4123,9 +4799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4154,9 +4838,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4185,9 +4877,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO @@ -4250,14 +4950,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4271,9 +4979,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO @@ -4296,9 +5012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO @@ -4321,9 +5045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO @@ -4346,9 +5078,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4358,9 +5098,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4383,9 +5131,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4408,9 +5164,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4457,14 +5221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO @@ -4477,9 +5249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO @@ -4499,9 +5279,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO @@ -4521,9 +5309,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO @@ -4543,9 +5339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4553,9 +5357,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4575,9 +5387,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4597,9 +5417,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4638,14 +5466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -4659,9 +5495,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxsspx vs4, o0, AO @@ -4684,9 +5528,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxsspx vs4, o0, AO @@ -4709,9 +5561,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxsspx vs0, o0, AO @@ -4734,9 +5594,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4746,9 +5614,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4771,9 +5647,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -4796,9 +5680,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -4845,14 +5737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO @@ -4865,9 +5765,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO @@ -4887,9 +5795,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO @@ -4909,9 +5825,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO @@ -4931,9 +5855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4941,9 +5873,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4963,9 +5903,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -4985,9 +5933,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5026,14 +5982,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5048,9 +6012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvw4x vs4, o0, AO @@ -5073,9 +6045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvw4x vs4, o0, AO @@ -5098,9 +6078,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvw4x vs0, o0, AO @@ -5123,9 +6111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -5134,9 +6130,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5159,9 +6163,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5184,9 +6196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO @@ -5220,14 +6240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5240,9 +6268,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO @@ -5261,9 +6297,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO @@ -5282,9 +6326,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO @@ -5303,18 +6355,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5333,9 +6401,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5354,9 +6430,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -5382,14 +6466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO @@ -5401,9 +6493,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO @@ -5420,9 +6520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO @@ -5439,9 +6547,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO @@ -5458,17 +6574,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5485,9 +6617,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5504,9 +6644,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -5528,14 +6676,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -5548,9 +6704,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxsspx vs4, o0, AO @@ -5569,9 +6733,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxsspx vs4, o0, AO @@ -5590,9 +6762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxsspx vs0, o0, AO @@ -5611,18 +6791,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5641,9 +6837,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -5662,9 +6866,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -5690,14 +6902,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO @@ -5709,9 +6929,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO @@ -5728,9 +6956,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO @@ -5747,9 +6983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO @@ -5766,17 +7010,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5793,9 +7053,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5812,9 +7080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -5836,5 +7112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c index 82366902d..3f0af4232 100644 --- a/kernel/power/zasum_microk_power8.c +++ b/kernel/power/zasum_microk_power8.c @@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x) "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c index 124614f62..959050e5f 100644 --- a/kernel/power/zaxpy_microk_power8.c +++ b/kernel/power/zaxpy_microk_power8.c @@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, __asm__ ( - "xxspltd 32, %x19, 0 \n\t" // alpha_r - "xxspltd 33, %x20, 0 \n\t" // alpha_i + XXSPLTD_S(32,%x19,0) // alpha_r + XXSPLTD_S(33,%x20,0) // alpha_i "lxvd2x 36, 0, %21 \n\t" // mvec @@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 51, %24, %3 \n\t" // y3 - "xxswapd %x8, 40 \n\t" // exchange real and imag part - "xxswapd %x9, 41 \n\t" // exchange real and imag part - "xxswapd %x10, 42 \n\t" // exchange real and imag part - "xxswapd %x11, 43 \n\t" // exchange real and imag part + XXSWAPD_S(%x8,40) // exchange real and imag part + XXSWAPD_S(%x9,41) // exchange real and imag part + XXSWAPD_S(%x10,42) // exchange real and imag part + XXSWAPD_S(%x11,43) // exchange real and imag part "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" @@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x7, %24, %3 \n\t" // y7 - "xxswapd %x12, 44 \n\t" // exchange real and imag part - "xxswapd %x13, 45 \n\t" // exchange real and imag part - "xxswapd %x14, 46 \n\t" // exchange real and imag part - "xxswapd %x15, 47 \n\t" // exchange real and imag part + XXSWAPD_S(%x12,44) // exchange real and imag part + XXSWAPD_S(%x13,45) // exchange real and imag part + XXSWAPD_S(%x14,46) // exchange real and imag part + XXSWAPD_S(%x15,47) // exchange real and imag part "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "addic. 
%1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 49, 41, 32 \n\t" @@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "addi %16, %16, 64 \n\t" - "xxswapd %x8, 40 \n\t" // exchange real and imag part - "xxswapd %x9, 41 \n\t" // exchange real and imag part + XXSWAPD_S(%x8,40) // exchange real and imag part + XXSWAPD_S(%x9,41) // exchange real and imag part "lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 49, %22, %3 \n\t" // y1 - "xxswapd %x10, 42 \n\t" // exchange real and imag part - "xxswapd %x11, 43 \n\t" // exchange real and imag part + XXSWAPD_S(%x10,42) // exchange real and imag part + XXSWAPD_S(%x11,43) // exchange real and imag part "lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 51, %24, %3 \n\t" // y3 - "xxswapd %x12, 44 \n\t" // exchange real and imag part + XXSWAPD_S(%x12,44) // exchange real and imag part "addi %3, %3, 64 \n\t" - "xxswapd %x13, 45 \n\t" // exchange real and imag part + XXSWAPD_S(%x13,45) // exchange real and imag part "lxvd2x %x4, 0, %3 \n\t" // y4 "lxvd2x %x5, %22, %3 \n\t" // y5 - "xxswapd %x14, 46 \n\t" // exchange real and imag part - "xxswapd %x15, 47 \n\t" // exchange real and imag part + XXSWAPD_S(%x14,46) // exchange real and imag part + XXSWAPD_S(%x15,47) // exchange real and imag part "lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x7, %24, %3 \n\t" // y7 "addi %3, %3, 64 \n\t" "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 49, 41, 32 \n\t" diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c index 5ca34b633..e29547047 100644 --- a/kernel/power/zcopy_microk_power8.c +++ b/kernel/power/zcopy_microk_power8.c @@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c index 71078b66c..dcde82433 100644 --- a/kernel/power/zdot_microk_power8.c +++ b/kernel/power/zdot_microk_power8.c @@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i - "xxswapd 0, 48 \n\t" // y0_i, y0_r - "xxswapd 1, 49 \n\t" // y1_i, y1_r - "xxswapd 2, 50 \n\t" // y2_i, y2_r - "xxswapd 3, 51 \n\t" // y3_i, y3_r + XXSWAPD_S(0,48) // y0_i, y0_r + XXSWAPD_S(1,49) // y1_i, y1_r + XXSWAPD_S(2,50) // y2_i, y2_r + XXSWAPD_S(3,51) // y3_i, y3_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" @@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i - "xxswapd 8, 4 \n\t" // y0_i, y0_r - "xxswapd 9, 5 \n\t" // y1_i, y1_r - "xxswapd 10, 6 \n\t" // y2_i, y2_r - "xxswapd 11, 7 \n\t" // y3_i, y3_r + XXSWAPD_S(8,4) // y0_i, y0_r + XXSWAPD_S(9,5) // y1_i, y1_r + XXSWAPD_S(10,6) // y2_i, y2_r + XXSWAPD_S(11,7) // y3_i, y3_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "addic. 
%1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i @@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i - "xxswapd 0,48 \n\t" // y0_i, y0_r - "xxswapd 1,49 \n\t" // y1_i, y1_r + XXSWAPD_S(0,48) // y0_i, y0_r + XXSWAPD_S(1,49) // y1_i, y1_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" - "xxswapd 2,50 \n\t" // y2_i, y2_r - "xxswapd 3,51 \n\t" // y3_i, y3_r + XXSWAPD_S(2,50) // y2_i, y2_r + XXSWAPD_S(3,51) // y3_i, y3_r "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i @@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i - "xxswapd 8,4 \n\t" // y0_i, y0_r - "xxswapd 9,5 \n\t" // y1_i, y1_r + XXSWAPD_S(8,4) // y0_i, y0_r + XXSWAPD_S(9,5) // y1_i, y1_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" - "xxswapd 10,6 \n\t" // y2_i, y2_r - "xxswapd 11,7 \n\t" // y3_i, y3_r + XXSWAPD_S(10,6) // y2_i, y2_r + XXSWAPD_S(11,7) // y3_i, y3_r "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index c43a115b2..24a36470c 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -67,7 +67,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -91,9 +95,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -151,9 +163,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -211,9 +231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -271,9 +299,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -311,9 +347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -371,9 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -431,9 +483,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -455,13 +515,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -479,13 +539,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -503,13 +563,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -527,13 +587,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -551,13 +611,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -575,13 +635,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -599,13 +659,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -623,13 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -685,13 +745,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs49,vs49) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs48 // realA*realB XSFADD_R2 vs0, vs0, vs49 // imagA*imagB - xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs48,vs48) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs49,vs49) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs48 // realA*imagB XSFADD_I2 vs1, vs1, vs49 // imagA*realB @@ -709,13 +769,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs51,vs51) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs50 // realA*realB XSFADD_R2 vs0, vs0, vs51 // imagA*imagB - xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs50,vs50) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs51,vs51) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs50 // realA*imagB XSFADD_I2 vs1, vs1, vs51 // imagA*realB @@ -733,13 +793,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs53,vs53) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs52 // realA*realB XSFADD_R2 vs0, vs0, vs53 // imagA*imagB - xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs52,vs52) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs53,vs53) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs52 // realA*imagB XSFADD_I2 vs1, vs1, vs53 // imagA*realB @@ -757,13 +817,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs55,vs55) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs54 // realA*realB XSFADD_R2 vs0, vs0, vs55 // imagA*imagB - xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs54,vs54) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs55,vs55) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs54 // realA*imagB XSFADD_I2 vs1, vs1, vs55 // imagA*realB @@ -781,13 +841,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs57,vs57) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs56 // realA*realB XSFADD_R2 vs0, vs0, vs57 // imagA*imagB - xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs56,vs56) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs57,vs57) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs56 // realA*imagB XSFADD_I2 vs1, vs1, vs57 // imagA*realB @@ -805,13 +865,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs59,vs59) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs58 // realA*realB XSFADD_R2 vs0, vs0, vs59 // imagA*imagB - xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs58,vs58) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs59,vs59) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs58 // realA*imagB XSFADD_I2 vs1, vs1, vs59 // imagA*realB @@ -829,13 +889,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs61,vs61) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs60 // realA*realB XSFADD_R2 vs0, vs0, vs61 // imagA*imagB - xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs60,vs60) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs61,vs61) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs60 // realA*imagB XSFADD_I2 vs1, vs1, vs61 // imagA*realB @@ -853,13 +913,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs63,vs63) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs62 // realA*realB XSFADD_R2 vs0, vs0, vs63 // imagA*imagB - xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs62,vs62) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs63,vs63) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs62 // realA*imagB XSFADD_I2 vs1, vs1, vs63 // imagA*realB @@ -900,14 +960,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -924,9 +992,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -961,9 +1037,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -998,9 +1082,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1035,9 +1127,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1059,9 +1159,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1096,9 +1204,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1133,9 +1249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -1152,13 +1276,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1176,13 +1300,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1200,13 +1324,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1224,13 +1348,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1273,13 +1397,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -1297,13 +1421,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -1321,13 +1445,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -1345,13 +1469,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -1383,14 +1507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1405,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1432,9 +1572,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1459,9 +1607,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1486,9 +1642,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1502,9 +1666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1529,9 +1701,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1556,9 +1736,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -1573,13 +1761,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1597,13 +1785,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1640,13 +1828,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1664,13 +1852,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1698,14 +1886,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1719,9 +1915,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1741,9 +1945,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1763,9 +1975,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1785,9 +2005,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1797,9 +2025,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1819,9 +2055,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1841,9 +2085,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -1857,13 +2109,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1897,13 +2149,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1929,14 +2181,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1958,9 +2218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1999,9 +2267,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2040,9 +2316,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2081,9 +2365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2104,9 +2396,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2145,9 +2445,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2186,9 +2494,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -2210,13 +2526,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2234,13 +2550,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2258,13 +2574,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2282,13 +2598,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2306,13 +2622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -2330,13 +2646,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -2354,13 +2670,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -2378,13 +2694,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -2425,14 +2741,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2447,9 +2771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2473,9 +2805,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2499,9 +2839,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2525,9 +2873,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2540,9 +2896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2566,9 +2930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2592,9 +2964,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -2611,13 +2991,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2635,13 +3015,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2659,13 +3039,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2683,13 +3063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2721,14 +3101,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2741,9 +3129,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2761,9 +3157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2781,9 +3185,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2801,9 +3213,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2812,9 +3232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2832,9 +3260,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2852,9 +3288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -2869,13 +3313,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2893,13 +3337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2927,14 +3371,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2946,9 +3398,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2963,9 +3423,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2980,9 +3448,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -2997,18 +3473,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3023,9 +3515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
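The upper-case XXSWAPD/XXSPLTD spellings replace the `xxswapd`/`xxspltd` extended mnemonics, which the AIX assembler rejects; both are encodings of the base `xxpermdi` instruction, so one set of macros (defined in a shared header, not shown in these hunks) can serve every platform. One way to write them, as a sketch:

    #define XXSWAPD(T,S)    xxpermdi T, S, S, 2        /* (hi,lo) -> (lo,hi)     */
    #define XXSPLTD(T,S,i)  xxpermdi T, S, S, ((i)*3)  /* duplicate doubleword i */
    #define XXMRGHD(T,A,B)  xxpermdi T, A, B, 0        /* (A.hi, B.hi)           */

The `_S` variants that appear later in the C micro-kernels would produce the same instruction as a string, for use inside inline-asm templates.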
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3040,9 +3540,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3056,13 +3564,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -3088,11 +3596,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`ZCOPYB_1x1', ` +#else .macro ZCOPYB_1x1 +#endif lxvdsx vs4, o0, BO // b0_r lxvdsx vs5, o8, BO // b0_i @@ -3101,10 +3617,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs5, o16, BBO addi BBO, BBO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`ZCOPYB_8x1', ` +#else .macro ZCOPYB_8x1 +#endif lxvd2x vs32, o0, BO lxvd2x vs33, o16, BO @@ -3118,23 +3642,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs39, o48, BO addi BO, BO, 64 - xxspltd vs40, vs32, 0 - xxspltd vs41, vs32, 1 - xxspltd vs42, vs33, 0 - xxspltd vs43, vs33, 1 - xxspltd vs44, vs34, 0 - xxspltd vs45, vs34, 1 - xxspltd vs46, vs35, 0 - xxspltd vs47, vs35, 1 - - xxspltd vs48, vs36, 0 - xxspltd vs49, vs36, 1 - xxspltd vs50, vs37, 0 - xxspltd vs51, vs37, 1 - xxspltd vs52, vs38, 0 - xxspltd vs53, vs38, 1 - xxspltd vs54, vs39, 0 - xxspltd vs55, vs39, 1 + XXSPLTD(vs40,vs32,0) + XXSPLTD(vs41,vs32,1) + XXSPLTD(vs42,vs33,0) + XXSPLTD(vs43,vs33,1) + XXSPLTD(vs44,vs34,0) + XXSPLTD(vs45,vs34,1) + XXSPLTD(vs46,vs35,0) + XXSPLTD(vs47,vs35,1) + + XXSPLTD(vs48,vs36,0) + XXSPLTD(vs49,vs36,1) + XXSPLTD(vs50,vs37,0) + XXSPLTD(vs51,vs37,1) + XXSPLTD(vs52,vs38,0) + XXSPLTD(vs53,vs38,1) + XXSPLTD(vs54,vs39,0) + XXSPLTD(vs55,vs39,1) stxvd2x vs40, o0, BBO stxvd2x vs41, o16, BBO @@ -3160,6 +3684,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs55, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S index 3f5a5ed03..654332375 100644 --- a/kernel/power/zgemm_tcopy_macros_8_power8.S +++ b/kernel/power/zgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
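The ZCOPYB_8x1 body above shows why XXSPLTD matters here: it loads eight complex elements of B and duplicates each real and each imaginary part across both doublewords of its own vector before storing to the packed buffer, so the compute kernels can multiply whole vectors against the duplicated scalars. A plain C sketch of that packing, assuming (real, imag) element order — a hypothetical illustration, not the build's actual copy routine:

    /* Pack n complex doubles from b into bb, duplicating each scalar part,
       as ZCOPYB_8x1/ZCOPYB_1x1 do with XXSPLTD and lxvdsx. */
    static void zcopyb_sketch(long n, const double *b, double *bb)
    {
        for (long i = 0; i < n; i++) {
            bb[4*i + 0] = bb[4*i + 1] = b[2*i + 0];  /* real, real */
            bb[4*i + 2] = bb[4*i + 3] = b[2*i + 1];  /* imag, imag */
        }
    }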
* Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs12, o32, T1 stxvd2x vs13, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c index d45468fd5..c6d666178 100644 --- a/kernel/power/zrot.c +++ b/kernel/power/zrot.c @@ -40,8 +40,8 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si __asm__ ( - "xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords - "xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords + XXSPLTD_S(36,%x[cos],0) // load c to both dwords + XXSPLTD_S(37,%x[sin],0) // load s to both dwords "lxvd2x 32, 0, %[x_ptr] \n\t" // load x "lxvd2x 33, %[i16], %[x_ptr] \n\t" @@ -57,10 +57,10 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si "addi %[y_ptr], %[y_ptr], 64 \n\t" "addic. 
%[temp_n], %[temp_n], -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" @@ -124,9 +124,9 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si "addi %[y_ptr], %[y_ptr], 128 \n\t" "addic. %[temp_n], %[temp_n], -4 \n\t" - "bgt+ 1b \n" + "bgt+ one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c index aba9029a0..567331775 100644 --- a/kernel/power/zscal_microk_power8.c +++ b/kernel/power/zscal_microk_power8.c @@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "dcbt 0, %2 \n\t" "xsnegdp 33, %x16 \n\t" // -alpha_i - "xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r - "xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i + XXSPLTD_S(32,%x15,0) // alpha_r , alpha_r + XXMRGHD_S(33,33,%x16) // -alpha_i , alpha_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 41, %17, %2 \n\t" @@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "addi %2, %2, 128 \n\t" "addic. %1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 49, 41, 32 \n\t" @@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t" - "xxswapd %x7, 40 \n\t" - "xxswapd %x8, 41 \n\t" - "xxswapd %x9, 42 \n\t" - "xxswapd %x10, 43 \n\t" - "xxswapd %x11, 44 \n\t" - "xxswapd %x12, 45 \n\t" - "xxswapd %x13, 46 \n\t" - "xxswapd %x14, 47 \n\t" + XXSWAPD_S(%x7,40) + XXSWAPD_S(%x8,41) + XXSWAPD_S(%x9,42) + XXSWAPD_S(%x10,43) + XXSWAPD_S(%x11,44) + XXSWAPD_S(%x12,45) + XXSWAPD_S(%x13,46) + XXSWAPD_S(%x14,47) "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "xvmuldp %x8, %x8, 33 \n\t" @@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "addi %2, %2, 256 \n\t" "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 49, 41, 32 \n\t" @@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t" - "xxswapd %x7, 40 \n\t" - "xxswapd %x8, 41 \n\t" - "xxswapd %x9, 42 \n\t" - "xxswapd %x10, 43 \n\t" - "xxswapd %x11, 44 \n\t" - "xxswapd %x12, 45 \n\t" - "xxswapd %x13, 46 \n\t" - "xxswapd %x14, 47 \n\t" + XXSWAPD_S(%x7,40) + XXSWAPD_S(%x8,41) + XXSWAPD_S(%x9,42) + XXSWAPD_S(%x10,43) + XXSWAPD_S(%x11,44) + XXSWAPD_S(%x12,45) + XXSWAPD_S(%x13,46) + XXSWAPD_S(%x14,47) "addi %2, %2, -128 \n\t" diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c index 54391ba5d..1e9fbe2cf 100644 --- a/kernel/power/zswap_microk_power8.c +++ b/kernel/power/zswap_microk_power8.c @@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" "lxvd2x 34, %6, %4 \n\t" @@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y) "addi %4, %4, 128 \n\t" "addic. 
%2, %2, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S index 701ec65c8..b3fbcd220 100644 --- a/kernel/power/ztrmm_macros_8x2_power8.S +++ b/kernel/power/ztrmm_macros_8x2_power8.S @@ -68,7 +68,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -92,9 +96,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -152,9 +164,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif xvmaddadp vs32, vs0, vs16 // real*real, imag*real @@ -221,9 +241,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -289,9 +317,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -329,9 +365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -449,9 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -473,13 +533,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
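The zrot.c/zscal/zswap hunks above make the inline-assembly bodies AIX-clean in three ways: numeric local labels (`1:`, `1b`, `2f`) become named labels suffixed with `%=`, which GCC expands to a number unique to each asm instance, since the AIX assembler has no local numeric labels; `.p2align 5` becomes `.align 5`, which on POWER likewise requests 2^5 = 32-byte alignment; and the `XXSWAPD_S`/`XXSPLTD_S`/`XXMRGHD_S` macros emit the corresponding xxpermdi forms as strings for use inside asm templates. A minimal sketch of the label idiom (a standalone countdown loop, not code from the patch):

    static long countdown(long n)
    {
        __asm__ volatile (
            ".align 5            \n"
            "one%=:              \n\t"  /* %= -> unique id per asm instance */
            "addic. %0, %0, -1   \n\t"
            "bgt    one%=        \n"    /* no 1b/1f on the AIX assembler */
            : "+r" (n)
            :
            : "cc");
        return n;  /* 0 on exit */
    }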
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -497,13 +557,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -521,13 +581,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -545,13 +605,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -569,13 +629,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -593,13 +653,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -617,13 +677,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -641,13 +701,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -703,13 +763,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs49,vs49) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs48 // realA*realB XSFADD_R2 vs0, vs0, vs49 // imagA*imagB - xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs48,vs48) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs49,vs49) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs48 // realA*imagB XSFADD_I2 vs1, vs1, vs49 // imagA*realB @@ -727,13 +787,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs51,vs51) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs50 // realA*realB XSFADD_R2 vs0, vs0, vs51 // imagA*imagB - xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs50,vs50) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs51,vs51) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs50 // realA*imagB XSFADD_I2 vs1, vs1, vs51 // imagA*realB @@ -751,13 +811,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs53,vs53) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs52 // realA*realB XSFADD_R2 vs0, vs0, vs53 // imagA*imagB - xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs52,vs52) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs53,vs53) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs52 // realA*imagB XSFADD_I2 vs1, vs1, vs53 // imagA*realB @@ -775,13 +835,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs55,vs55) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs54 // realA*realB XSFADD_R2 vs0, vs0, vs55 // imagA*imagB - xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs54,vs54) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs55,vs55) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs54 // realA*imagB XSFADD_I2 vs1, vs1, vs55 // imagA*realB @@ -799,13 +859,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs57,vs57) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs56 // realA*realB XSFADD_R2 vs0, vs0, vs57 // imagA*imagB - xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs56,vs56) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs57,vs57) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs56 // realA*imagB XSFADD_I2 vs1, vs1, vs57 // imagA*realB @@ -823,13 +883,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs59,vs59) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs58 // realA*realB XSFADD_R2 vs0, vs0, vs59 // imagA*imagB - xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs58,vs58) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs59,vs59) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs58 // realA*imagB XSFADD_I2 vs1, vs1, vs59 // imagA*realB @@ -847,13 +907,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs61,vs61) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs60 // realA*realB XSFADD_R2 vs0, vs0, vs61 // imagA*imagB - xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs60,vs60) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs61,vs61) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs60 // realA*imagB XSFADD_I2 vs1, vs1, vs61 // imagA*realB @@ -871,13 +931,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs63,vs63) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs62 // realA*realB XSFADD_R2 vs0, vs0, vs63 // imagA*imagB - xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs62,vs62) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs63,vs63) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs62 // realA*imagB XSFADD_I2 vs1, vs1, vs63 // imagA*realB @@ -918,14 +978,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -942,9 +1010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -979,9 +1055,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1016,9 +1100,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1053,9 +1145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1077,9 +1177,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1114,9 +1222,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1151,9 +1267,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -1170,13 +1294,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1194,13 +1318,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1218,13 +1342,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1242,13 +1366,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1291,13 +1415,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -1315,13 +1439,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -1339,13 +1463,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -1363,13 +1487,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -1401,14 +1525,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1423,9 +1555,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1450,9 +1590,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1477,9 +1625,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1504,9 +1660,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1520,9 +1684,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1547,9 +1719,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1574,9 +1754,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -1591,13 +1779,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1615,13 +1803,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1658,13 +1846,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1682,13 +1870,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1716,14 +1904,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1737,9 +1933,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1759,9 +1963,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1781,9 +1993,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1803,9 +2023,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1815,9 +2043,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1837,9 +2073,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1859,9 +2103,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -1875,13 +2127,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1915,13 +2167,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1947,14 +2199,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1976,9 +2236,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2017,9 +2285,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2058,9 +2334,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2099,9 +2383,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2122,9 +2414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2163,9 +2463,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2204,9 +2512,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -2228,13 +2544,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2252,13 +2568,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2276,13 +2592,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2300,13 +2616,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2324,13 +2640,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -2348,13 +2664,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -2372,13 +2688,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -2396,13 +2712,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -2443,14 +2759,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
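// NOTE: each XXSWAPD/XSFADD_* group above collapses one accumulator pair
// into a single complex element of C. The vector accumulators hold the
// partial products {realA*realB, imagA*realB} and {realA*imagB, imagA*imagB},
// and the scalar XSFADD_* steps pick out one doubleword at a time, which is
// what the swaps are for. In plain C the reduction is the ordinary complex
// multiply-accumulate -- a sketch, with the signs fixed to the
// non-conjugated case (the real XSFADD_R2/XSFADD_I2 macros are defined
// earlier in the file as adds or subtracts depending on the CONJ/XCONJ
// build flags):
//
//   typedef struct { double r, i; } dcmplx;
//
//   static dcmplx cmadd(dcmplx acc, dcmplx a, dcmplx b)
//   {
//       acc.r += a.r * b.r - a.i * b.i;   // XSFADD_R1, then XSFADD_R2
//       acc.i += a.r * b.i + a.i * b.r;   // XSFADD_I1, then XSFADD_I2
//       return acc;
//   }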
add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2465,9 +2789,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2491,9 +2823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2517,9 +2857,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2543,9 +2891,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2558,9 +2914,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2584,9 +2948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2610,9 +2982,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -2629,13 +3009,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
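// NOTE: in the LOAD/KERNEL macros above, B is fetched with lxvdsx, which
// loads one doubleword and splats it into both halves of the VSX register,
// while A is fetched with lxvd2x as {real, imag} pairs. A single xvmuldp or
// xvmaddadp against the splatted real part of b therefore produces
// {a_r*b_r, a_i*b_r}, and one against the splatted imaginary part produces
// {a_r*b_i, a_i*b_i} -- exactly the two partial-product vectors that the
// SAVE macros later reduce. The o0/o8 operands appear to be index registers
// this kernel keeps preloaded with the byte offsets 0 and 8, i.e. the real
// and imaginary halves of one double-precision complex value.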
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2653,13 +3033,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2677,13 +3057,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2701,13 +3081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2739,14 +3119,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
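// NOTE: "xxlxor vsN, vsN, vsN" above is the idiomatic VSX register clear
// (x ^ x == 0); it resets the scalar accumulators vs0/vs1 before each
// element of C is reduced without having to load a zero constant from
// memory.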
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2759,9 +3147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2779,9 +3175,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2799,9 +3203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2819,9 +3231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2830,9 +3250,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2850,9 +3278,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2870,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -2887,13 +3331,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
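// NOTE: the macro suffixes follow the usual OpenBLAS software-pipelining
// scheme: _I1 performs the first multiplies (xvmuldp, initialising the
// accumulators) while already streaming the next slice of A into the second
// register bank, _1 and _2 then alternate between the two banks (vs0..7 and
// vs8..15 in the wider kernels) so loads for one step overlap the multiplies
// of the other, _E2 drains the final bank without issuing loads, and
// _SUBI1/_SUB1 handle leftover iterations one at a time. A minimal C model
// of the two-bank idea (illustrative only -- the real kernels keep the
// banks in VSX registers and process vectors, not scalars):
//
//   #include <stddef.h>
//
//   double pipelined_dot(size_t k, const double *a, const double *b)
//   {
//       /* assumes k >= 1 */
//       double acc = 0.0, bank0, bank1;
//       size_t i = 1;
//       bank0 = a[0] * b[0];              /* _I1: prime bank 0            */
//       for (; i + 1 < k; i += 2) {
//           bank1 = a[i] * b[i];          /* _1: fill bank 1, retire 0    */
//           acc += bank0;
//           bank0 = a[i + 1] * b[i + 1];  /* _2: fill bank 0, retire 1    */
//           acc += bank1;
//       }
//       acc += bank0;                     /* _E2: drain the last bank     */
//       for (; i < k; i++)                /* _SUB1: remainder, one by one */
//           acc += a[i] * b[i];
//       return acc;
//   }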
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2911,13 +3355,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2945,14 +3389,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2964,9 +3416,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2981,9 +3441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2998,9 +3466,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3015,18 +3491,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3041,9 +3533,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
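// NOTE: even the M=1 tail above keeps the 128-bit vector form -- one
// double-precision complex value exactly fills a VSX register as
// {real, imag}, so the KERNEL1x1 family is the same lxvd2x/xvmaddadp
// pattern as the wider kernels, just with a single A register instead of
// eight.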
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3058,9 +3558,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3074,13 +3582,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -3106,5 +3614,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif From 3635fdbf2bfbb3bd56a7fb3e0c1a1e21ef4d0b72 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Aug 2019 22:52:17 +0200 Subject: [PATCH 683/935] Do not abuse the global ARCH variable as a local temporary Setting it with a simple "uname -m" just to be able to decide whether to compile getarch.c with -march=native may actually keep getarch from doing a proper probe. Fixes #2231, a regression caused by #2110 --- Makefile.system | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index a54282f6c..fe6be79db 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,9 +9,11 @@ ifndef TOPDIR TOPDIR = . endif -# If ARCH is not set, we use the host system's architecture. +# If ARCH is not set, we use the host system's architecture for getarch compile options. ifndef ARCH -ARCH := $(shell uname -m) +HOSTARCH := $(shell uname -m) +else +HOSTARCH = $(ARCH) endif # Catch conflicting usage of ARCH in some BSD environments @@ -143,7 +145,7 @@ endif # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. 
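# NOTE on the HOSTARCH change in this hunk: ARCH is consulted in many other
# places in Makefile.system, so assigning it from `uname -m` purely to gate
# the -march=native flag could make the rest of the build behave as if an
# architecture had been chosen explicitly, keeping getarch from performing
# its full probe (issue #2231). The informational host value now lives in
# HOSTARCH, and only the getarch compile option below depends on it.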
-ifeq ($(ARCH), x86_64) +ifeq ($(HOSTARCH), x86_64) ifeq ($(findstring pgcc,$(HOSTCC)),) GETARCH_FLAGS += -march=native endif From 7d380f7d79abe1a8d7ed6efd56efe677135c2415 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Aug 2019 11:31:20 +0200 Subject: [PATCH 684/935] Fix PGI build options (again) for #2237 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index fe6be79db..2cf1322a9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -699,7 +699,7 @@ endif ifeq ($(C_COMPILER), PGI) ifdef BINARY64 -CCOMMON_OPT += -tp p7-64 +CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm else CCOMMON_OPT += -tp p7 endif From 3a55dca2dceafef421c6198d7dd1876f4bcc5663 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Aug 2019 11:35:31 +0200 Subject: [PATCH 685/935] Make x86_64 zdot compile with PGI and Sun C again broken by #2222 as CREAL,CIMAG do not expand to a valid lvalue with these compilers --- kernel/x86_64/zdot.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 48f855b0e..d11cb764f 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -181,10 +181,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA #if defined(SMP) int nthreads; FLOAT dummy_alpha; + FLOAT zdotr=0., zdoti=0.; #endif OPENBLAS_COMPLEX_FLOAT zdot; - CREAL(zdot) = 0.0; - CIMAG(zdot) = 0.0; + zdot=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); #if defined(SMP) if (inc_x == 0 || inc_y == 0 || n <= 10000) @@ -211,15 +211,17 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { - CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); - CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); + zdotr += CREAL(*ptr); + zdoti += CIMAG(*ptr); +// CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); +// CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); ptr = (void *)(((char *)ptr) + sizeof(double) * 2); } + zdot = OPENBLAS_MAKE_COMPLEX_FLOAT(zdotr,zdoti); } #else zdot_compute(n, x, inc_x, y, inc_y, &zdot); #endif - return zdot; } From bf0d92a3105ae0ed67117dcf0977a164ac9e2e7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Aug 2019 17:35:56 +0200 Subject: [PATCH 686/935] Add arch data for cross-compiling to CORE2 for #2235 --- cmake/prebuild.cmake | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index e508a46c2..c2600bd0d 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -106,7 +106,25 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS file(APPEND ${TARGET_CONF_TEMP} "#define ${TCORE}\n" "#define CHAR_CORENAME \"${TCORE}\"\n") - if ("${TCORE}" STREQUAL "ARMV7") + if ("${TCORE}" STREQUAL "CORE2") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L2_SIZE\t1048576\n" + "#define L2_LINESIZE\t64\n" + "#define DTB_DEFAULT_ENTRIES\t256\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_CMOV\n" + "#define HAVE_MMX\n" + "#define HAVE_SSE\n" + "#define HAVE_SSE2\n" + "#define HAVE_SSE3\n" + "#define HAVE_SSSE3\n") + set(SGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 4) + elseif ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_LINESIZE\t32\n" From 11c59acfb1a061f35ec88f11c2176f60b4916e93 Mon Sep 17 00:00:00 
2001 From: Martin Kroeker Date: Wed, 28 Aug 2019 18:07:44 +0200 Subject: [PATCH 687/935] Keep both PGI/SUN and default code paths to avoid breaking Clang/WIndows --- kernel/x86_64/zdot.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index d11cb764f..01169e8e6 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -181,11 +181,19 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA #if defined(SMP) int nthreads; FLOAT dummy_alpha; +#if defined(C_PGI) || defined(C_SUN) FLOAT zdotr=0., zdoti=0.; +#endif #endif + OPENBLAS_COMPLEX_FLOAT zdot; +#if defined(C_PGI) || defined(C_SUN) zdot=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); - +#else + CREAL(zdot) = 0.0; + CIMAG(zdot) = 0.0; +#endif + #if defined(SMP) if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; @@ -211,17 +219,23 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { +#if defined(C_PGI) || defined(C_SUN) zdotr += CREAL(*ptr); zdoti += CIMAG(*ptr); -// CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); -// CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); +#else + CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); + CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); +#endif ptr = (void *)(((char *)ptr) + sizeof(double) * 2); } +#if defined(C_PGI) || defined(C_SUN) zdot = OPENBLAS_MAKE_COMPLEX_FLOAT(zdotr,zdoti); +#endif } #else zdot_compute(n, x, inc_x, y, inc_y, &zdot); #endif + return zdot; } From be09551cdf2efdddb3d671c1355c46560e4610f2 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Thu, 29 Aug 2019 23:22:23 +0000 Subject: [PATCH 688/935] aligned --- kernel/power/caxpy.c | 7 +++---- kernel/power/cdot.c | 6 +++--- kernel/power/cgemv_n.c | 6 +++--- kernel/power/cgemv_t.c | 6 +++--- kernel/power/dgemv_n.c | 2 +- kernel/power/dgemv_t.c | 6 +++--- kernel/power/sgemv_n.c | 3 ++- kernel/power/sgemv_n_8.c | 3 ++- kernel/power/sgemv_t.c | 7 +++---- kernel/power/sgemv_t_8.c | 4 ++-- kernel/power/zgemv_n_4.c | 4 ++-- kernel/power/zgemv_t_4.c | 4 ++-- 12 files changed, 29 insertions(+), 29 deletions(-) diff --git a/kernel/power/caxpy.c b/kernel/power/caxpy.c index 4bdf13c34..00f2ec5e0 100644 --- a/kernel/power/caxpy.c +++ b/kernel/power/caxpy.c @@ -24,12 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" - - #ifndef HAVE_ASM_KERNEL #include +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { @@ -43,7 +42,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; #endif - __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); register __vector float *vy = (__vector float *) y; register __vector float *vx = (__vector float *) x; BLASLONG i=0; diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index f86a33f22..51d341ada 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -25,12 +25,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" - #ifndef HAVE_KERNEL_8 #include +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) { - __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); register __vector float *vy = (__vector float *) y; register __vector float *vx = (__vector float *) x; BLASLONG i = 0; @@ -96,7 +96,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA BLASLONG i = 0; BLASLONG ix=0, iy=0; OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + FLOAT dot[4] __attribute__((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; if (n <= 0) { CREAL(result) = 0.0; diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index cb01e196e..6a195d6d1 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 -static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { @@ -247,8 +247,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG m2; BLASLONG m3; BLASLONG n2; - - FLOAT xbuffer[8], *ybuffer; + FLOAT xbuffer[8] __attribute__((aligned(16))); + FLOAT *ybuffer; if (m < 1) return (0); if (n < 1) return (0); diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c index c646618cf..68bbdd60a 100644 --- a/kernel/power/cgemv_t.c +++ b/kernel/power/cgemv_t.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
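/* NOTE: the aligned(16) attributes added in this patch matter because the
 * arrays are consumed by 16-byte vector operations: swap_mask_arr is read
 * with a direct vector dereference, and the xbuffer/ybuffer stack arrays
 * are walked with vector loads and stores. Depending on the instruction
 * the compiler picks (lvx in particular ignores the low four address
 * bits), an unaligned array would silently supply wrong data rather than
 * fault. A minimal sketch of the pattern being made safe, assuming
 * <altivec.h> and a VSX-capable compiler:
 *
 *     #include <altivec.h>
 *
 *     static const unsigned char __attribute__((aligned(16))) mask[16] =
 *         { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11 };
 *
 *     static inline __vector unsigned char load_mask(void)
 *     {
 *         // well-defined: the 16-byte object starts on a 16-byte boundary
 *         return *((__vector unsigned char *)mask);
 *     }
 */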
#define NBMAX 1024 #include -static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; @@ -260,8 +260,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG m2; BLASLONG m3; BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; if (m < 1) return (0); if (n < 1) return (0); diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index b458e11fc..1a3d7669c 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG m3; BLASLONG n2; BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8] __attribute__ ((aligned (16)));; + FLOAT xbuffer[8] __attribute__ ((aligned (16))); FLOAT *ybuffer; if ( m < 1 ) return(0); diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index b8589a131..d05d7b7d3 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -581,9 +581,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG m1; BLASLONG m2; BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; if (m < 1) return (0); if (n < 1) return (0); diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 9704757fe..81ac031a3 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -174,7 +174,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; BLASLONG lda4 = lda << 2; BLASLONG lda8 = lda << 3; - FLOAT xbuffer[8],*ybuffer; + FLOAT xbuffer[8] __attribute__((aligned(16))); + FLOAT *ybuffer; if ( m < 1 ) return(0); if ( n < 1 ) return(0); diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index 9bc93ced6..64696236a 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -213,7 +213,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n2; BLASLONG lda4 = lda << 2; BLASLONG lda8 = lda << 3; - FLOAT xbuffer[8],*ybuffer; + FLOAT xbuffer[8] __attribute__((aligned(16))); + FLOAT *ybuffer; if ( m < 1 ) return(0); if ( n < 1 ) return(0); diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 96434a13f..3d8a442dc 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -177,10 +177,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG m1; BLASLONG m2; BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; - + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; if (m < 1) return (0); if (n < 1) return (0); diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index 5e9cd63ac..b90512162 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -204,8 +204,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG m3; BLASLONG n2; - FLOAT ybuffer[8], *xbuffer; - + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; if (m < 1) return (0); if (n < 1) return (0); diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 167b0a158..ba019d6a5 100644 --- a/kernel/power/zgemv_n_4.c +++ 
b/kernel/power/zgemv_n_4.c @@ -614,8 +614,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG m2; BLASLONG m3; BLASLONG n2; - - FLOAT xbuffer[8], *ybuffer; + FLOAT xbuffer[8] __attribute__((aligned(16))); + FLOAT *ybuffer; if (m < 1) return (0); if (n < 1) return (0); diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 20a0812dd..b34199af6 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -532,8 +532,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG m2; BLASLONG m3; BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; if (m < 1) return (0); if (n < 1) return (0); From e79712d96941c099d2f5e4b11544b2a20d97fbdf Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Fri, 30 Aug 2019 02:52:04 +0000 Subject: [PATCH 689/935] cgemv using vec_vsx_ld instead of letting gcc to decide --- kernel/power/cgemv_n.c | 97 +++++++++++++++++++++++------------------- kernel/power/cgemv_t.c | 97 +++++++++++++++++++++++++++--------------- 2 files changed, 115 insertions(+), 79 deletions(-) diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index 6a195d6d1..e85517ffa 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -62,23 +62,24 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; #endif - register __vector float *vy = (__vector float *) y; + register __vector float *vptr_y = (__vector float *) y; register __vector float *vptr_a0 = (__vector float *) a0; register __vector float *vptr_a1 = (__vector float *) a1; register __vector float *vptr_a2 = (__vector float *) a2; register __vector float *vptr_a3 = (__vector float *) a3; BLASLONG i = 0; - for (;i< n / 2; i+=2) { - register __vector float vy_0 = vy[i]; - register __vector float vy_1 = vy[i + 1]; - register __vector float va0 = vptr_a0[i]; - register __vector float va1 = vptr_a1[i]; - register __vector float va2 = vptr_a2[i]; - register __vector float va3 = vptr_a3[i]; - register __vector float va0_1 = vptr_a0[i + 1]; - register __vector float va1_1 = vptr_a1[i + 1]; - register __vector float va2_1 = vptr_a2[i + 1]; - register __vector float va3_1 = vptr_a3[i + 1]; + BLASLONG i2=16; + for (;i< n * 8; i+=32,i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; @@ -93,8 +94,8 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; - vy[i] = vy_0; - vy[i + 1] = vy_1; + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); } } @@ -118,17 +119,19 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA register 
__vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; #endif - register __vector float *vy = (__vector float *) y; + register __vector float *vptr_y = (__vector float *) y; register __vector float *vptr_a0 = (__vector float *) a0; register __vector float *vptr_a1 = (__vector float *) a1; - BLASLONG i = 0; - for (;i< n / 2; i+=2) { - register __vector float vy_0 = vy[i]; - register __vector float vy_1 = vy[i + 1]; - register __vector float va0 = vptr_a0[i]; - register __vector float va1 = vptr_a1[i]; - register __vector float va0_1 = vptr_a0[i + 1]; - register __vector float va1_1 = vptr_a1[i + 1]; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va0x = vec_perm(va0, va0,swap_mask); register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); register __vector float va1x = vec_perm(va1, va1,swap_mask); @@ -136,8 +139,8 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; - vy[i] = vy_0; - vy[i + 1] = vy_1; + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); } } @@ -154,21 +157,23 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; #endif - register __vector float *vy = (__vector float *) y; + register __vector float *vptr_y = (__vector float *) y; register __vector float *vptr_a0 = (__vector float *) ap; BLASLONG i = 0; - for (;i< n / 2; i+=2) { - register __vector float vy_0 = vy[i]; - register __vector float vy_1 = vy[i + 1]; - register __vector float va0 = vptr_a0[i]; - register __vector float va0_1 = vptr_a0[i + 1]; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va0x = vec_perm(va0, va0,swap_mask); register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); vy_0 += va0*vx0_r + va0x*vx0_i; vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - vy[i] = vy_0; - vy[i + 1] = vy_1; + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); } } @@ -213,20 +218,24 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT register __vector float *vptr_src = (__vector float *) src; register __vector float *vptr_y = (__vector float *) dest; - for (i = 0; i < n/2; i += 2 ){ - register __vector float vy_0 = vptr_y[i]; - register __vector float vy_1 = vptr_y[i +1]; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + + + register __vector float vsrc = vec_vsx_ld(i,vptr_src); + register __vector float vsrc_1 = vec_vsx_ld(i2,vptr_src); + + register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask); + register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask); - 
register __vector float vsrc = vptr_src[i]; - register __vector float vsrc_1 = vptr_src[i + 1]; - register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask); - register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask); + vy_0 += vsrc*valpha_r + vsrcx*valpha_i; + vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i; - vy_0 += vsrc*valpha_r + vsrcx*valpha_i; - vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i; - vptr_y[i] = vy_0; - vptr_y[i+1 ] = vy_1; + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); } diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c index 68bbdd60a..57eb066b0 100644 --- a/kernel/power/cgemv_t.c +++ b/kernel/power/cgemv_t.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; a0 = ap; a1 = ap + lda; @@ -48,26 +48,39 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* vptr_a2 = (__vector float*) a2; + __vector float* vptr_a3 = (__vector float*) a3; __vector float* v_x = (__vector float*) x; - for (i = 0; i < n / 2; i+=2) { - register __vector float vx_0 = v_x[i]; - register __vector float vx_1 = v_x[i+1]; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; - vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; - vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; - vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; - vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1]; - vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1]; - vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1]; - vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1]; + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + vtemp2_p += vx_0*va2 + vx_1*va2_1; + vtemp2_r += vxr_0*va2 + vxr_1*va2_1; + vtemp3_p += vx_0*va3 + vx_1*va3_1; + vtemp3_r += vxr_0*va3 + vxr_1*va3_1; } @@ -128,7 +141,7 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; + FLOAT *a0, 
*a1; a0 = ap; a1 = ap + lda; @@ -138,23 +151,33 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; + + + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; __vector float* v_x = (__vector float*) x; - for (i = 0; i < n / 2; i+=2) { - register __vector float vx_0 = v_x[i]; - register __vector float vx_1 = v_x[i+1]; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; - vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; - vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; - vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - } + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + + } #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; @@ -193,23 +216,27 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - __vector float* va0 = (__vector float*) ap; + __vector float* vptr_a0 = (__vector float*) ap; __vector float* v_x = (__vector float*) x; - - for (i = 0; i < n / 2; i+=2) { - register __vector float vx_0 = v_x[i]; - register __vector float vx_1 = v_x[i+1]; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; - vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; } #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) From 4c22828812a9d5f0962c836d4c8bf486fde4d9cb Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Fri, 30 Aug 2019 04:09:15 +0000 Subject: [PATCH 690/935] caxpy and cdot are using vec_vsx_ld --- kernel/power/caxpy.c | 67 ++++++++++++++++++++++++++------------------ kernel/power/cdot.c | 52 +++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 50 deletions(-) diff --git 
a/kernel/power/caxpy.c b/kernel/power/caxpy.c index 00f2ec5e0..0545766b1 100644 --- a/kernel/power/caxpy.c +++ b/kernel/power/caxpy.c @@ -27,6 +27,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifndef HAVE_ASM_KERNEL #include + +#define offset_0 0 +#define offset_1 16 +#define offset_2 32 +#define offset_3 48 +#define offset_4 64 +#define offset_5 80 +#define offset_6 96 +#define offset_7 112 + static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) @@ -43,27 +53,28 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT #endif __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - register __vector float *vy = (__vector float *) y; - register __vector float *vx = (__vector float *) x; + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_x = (__vector float *) x; BLASLONG i=0; - for (; i < n/2; i += 8) { + for(;i + +#define offset_0 0 +#define offset_1 16 +#define offset_2 32 +#define offset_3 48 + + + static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) { __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - register __vector float *vy = (__vector float *) y; - register __vector float *vx = (__vector float *) x; - BLASLONG i = 0; + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_x = (__vector float *) x; register __vector float vd_0 = { 0 }; register __vector float vd_1 = { 0 }; register __vector float vd_2 = { 0 }; @@ -41,26 +48,23 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) register __vector float vdd_0 = { 0 }; register __vector float vdd_1 = { 0 }; register __vector float vdd_2 = { 0 }; - register __vector float vdd_3 = { 0 }; - for (; i < n/2; i += 4) { - - register __vector float vyy_0 ; - register __vector float vyy_1 ; - register __vector float vyy_2 ; - register __vector float vyy_3 ; - - register __vector float vy_0 = vy[i]; - register __vector float vy_1 = vy[i + 1]; - register __vector float vy_2 = vy[i + 2]; - register __vector float vy_3 = vy[i + 3]; - register __vector float vx_0= vx[i]; - register __vector float vx_1 = vx[i + 1]; - register __vector float vx_2 = vx[i + 2]; - register __vector float vx_3 = vx[i + 3]; - vyy_0 = vec_perm(vy_0, vy_0, swap_mask); - vyy_1 = vec_perm(vy_1, vy_1, swap_mask); - vyy_2 = vec_perm(vy_2, vy_2, swap_mask); - vyy_3 = vec_perm(vy_3, vy_3, swap_mask); + register __vector float vdd_3 = { 0 }; + BLASLONG i=0; + for(;i Date: Fri, 30 Aug 2019 11:14:55 +0000 Subject: [PATCH 691/935] fix uninitialized variables i --- kernel/power/cgemv_n.c | 4 ++-- kernel/power/cgemv_t.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index e85517ffa..eec3fa37c 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -181,7 +181,7 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; + BLASLONG i=0; if (inc_dest != 2) { @@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int 
CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { - BLASLONG i; + BLASLONG i=0; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c index 57eb066b0..691f7a3d3 100644 --- a/kernel/power/cgemv_t.c +++ b/kernel/power/cgemv_t.c @@ -276,8 +276,8 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { } int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; + BLASLONG i=0; + BLASLONG j=0; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; From b5af7b9c7808f06d6c10c2b2db90a054ab746970 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 31 Aug 2019 18:06:12 +0200 Subject: [PATCH 692/935] Disable ppc64le test environment on Travis CI as this semi-official beta option has suddenly reverted to a standard x86_64 environment causing spurious failures --- .travis.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index a92bb0687..27ecba6c8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,14 +25,14 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" - - <<: *test-ubuntu - os: linux-ppc64le - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" - env: - # for matrix annotation only - - TARGET_BOX=PPC64LE_LINUX - - BTYPE="BINARY=64 USE_OPENMP=1" + # - <<: *test-ubuntu + # os: linux-ppc64le + # before_script: + # - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + # env: + # # for matrix annotation only + # - TARGET_BOX=PPC64LE_LINUX + # - BTYPE="BINARY=64 USE_OPENMP=1" - <<: *test-ubuntu env: From 1fec0570f6b0561a52d72e5d37bbae5fb8d467cb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Sep 2019 15:03:45 +0200 Subject: [PATCH 693/935] Add cgemm and zgemm unroll factors for core2 --- cmake/prebuild.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index c2600bd0d..2fe168a1c 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -124,6 +124,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 4) + set(CGEMM_DEFAULT_UNROLL_M 4) + set(CGEMM_DEFAULT_UNROLL_N 2) + set(ZGEMM_DEFAULT_UNROLL_M 2) + set(ZGEMM_DEFAULT_UNROLL_N 2) elseif ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" From fde8a8e6a02b1186a178f8bbe3b3e5d84c8786e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Sep 2019 22:41:17 +0200 Subject: [PATCH 694/935] Improve cmake build behaviour with non-host cpu targets (#2246) 1. Supply appropriate values for C/Z GEMM unroll when cross-compiling for CORE2 or ARMV7 2. Add the required xLOCAL_BUFFER_SIZE parameters for cross-compiling CORE2 3. 
Add -DFORCE_ option to getarch when building with -DTARGET=target for #2245 --- cmake/prebuild.cmake | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 2fe168a1c..da185db5a 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -105,6 +105,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS # Perhaps this should be inside a different file as it grows larger file(APPEND ${TARGET_CONF_TEMP} "#define ${TCORE}\n" + "#define CORE_${TCORE}\n" "#define CHAR_CORENAME \"${TCORE}\"\n") if ("${TCORE}" STREQUAL "CORE2") file(APPEND ${TARGET_CONF_TEMP} @@ -119,15 +120,19 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define HAVE_SSE\n" "#define HAVE_SSE2\n" "#define HAVE_SSE3\n" - "#define HAVE_SSSE3\n") + "#define HAVE_SSSE3\n" + "#define SLOCAL_BUFFER_SIZE\t16384\n" + "#define DLOCAL_BUFFER_SIZE\t16384\n" + "#define CLOCAL_BUFFER_SIZE\t16384\n" + "#define ZLOCAL_BUFFER_SIZE\t16384\n") set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 4) - set(CGEMM_DEFAULT_UNROLL_M 4) - set(CGEMM_DEFAULT_UNROLL_N 2) - set(ZGEMM_DEFAULT_UNROLL_M 2) - set(ZGEMM_DEFAULT_UNROLL_N 2) + set(CGEMM_UNROLL_M 4) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) elseif ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" @@ -143,6 +148,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) elseif ("${TCORE}" STREQUAL "ARMV8") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t32768\n" @@ -331,6 +340,9 @@ else(NOT CMAKE_CROSSCOMPILING) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) else() list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) + if (DEFINED TARGET_CORE) + set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE}) + endif () endif () if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") From 4de545aa7da84c6bbb5d2d843d91a4900ad9a3e1 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sat, 7 Sep 2019 10:21:08 +0300 Subject: [PATCH 696/935] address minor warnings from gcc7 --- driver/others/openblas_get_config.c | 8 ++++---- lapack/trtri/trtri_L_parallel.c | 6 +++--- lapack/trtri/trtri_U_parallel.c | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 81648fb7c..7fefee33d 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -78,10 +78,10 @@ char tmpstr[20]; #ifdef DYNAMIC_ARCH strcat(tmp_config_str, gotoblas_corename()); #endif -if (openblas_get_parallel() == 0) - sprintf(tmpstr, " SINGLE_THREADED"); -else - snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); + if (openblas_get_parallel() == 0) + sprintf(tmpstr, " SINGLE_THREADED"); + else + snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); strcat(tmp_config_str, tmpstr); return tmp_config_str; } diff --git a/lapack/trtri/trtri_L_parallel.c b/lapack/trtri/trtri_L_parallel.c index 5dc60b862..fb8c8fc77 100644 --- a/lapack/trtri/trtri_L_parallel.c +++ b/lapack/trtri/trtri_L_parallel.c @@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG n, info; BLASLONG bk, i, blocking, 
start_i; int mode; - BLASLONG lda, range_N[2]; + BLASLONG lda; // , range_N[2]; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { ONE, ZERO}; @@ -100,8 +100,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, bk = n - i; if (bk > blocking) bk = blocking; - range_N[0] = i; - range_N[1] = i + bk; + /* range_N[0] = i; + range_N[1] = i + bk; */ newarg.lda = lda; newarg.ldb = lda; diff --git a/lapack/trtri/trtri_U_parallel.c b/lapack/trtri/trtri_U_parallel.c index fc48a33f1..5287421d6 100644 --- a/lapack/trtri/trtri_U_parallel.c +++ b/lapack/trtri/trtri_U_parallel.c @@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG n, info; BLASLONG bk, i, blocking; int mode; - BLASLONG lda, range_N[2]; + BLASLONG lda; //, range_N[2]; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { ONE, ZERO}; @@ -96,8 +96,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, bk = n - i; if (bk > blocking) bk = blocking; - range_N[0] = i; - range_N[1] = i + bk; + /* range_N[0] = i; + range_N[1] = i + bk; */ newarg.lda = lda; newarg.ldb = lda; From ea747cf933a3ab6b82fbb726a51c70d34b3b91dc Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 30 Aug 2019 15:06:38 -0400 Subject: [PATCH 697/935] start working on ?trtrs --- common_macro.h | 117 ++++++++++++++++++++++++++- interface/lapack/trtrs.c | 171 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+), 4 deletions(-) create mode 100644 interface/lapack/trtrs.c diff --git a/common_macro.h b/common_macro.h index d2503aa65..e8a4a66ed 100644 --- a/common_macro.h +++ b/common_macro.h @@ -641,7 +641,7 @@ #define IMATCOPY_K_CT DIMATCOPY_K_CT #define IMATCOPY_K_RT DIMATCOPY_K_RT -#define GEADD_K DGEADD_K +#define GEADD_K DGEADD_K #else #define AMAX_K SAMAX_K @@ -944,7 +944,7 @@ #define IMATCOPY_K_CT SIMATCOPY_K_CT #define IMATCOPY_K_RT SIMATCOPY_K_RT -#define GEADD_K SGEADD_K +#define GEADD_K SGEADD_K #endif #else #ifdef XDOUBLE @@ -1770,7 +1770,7 @@ #define IMATCOPY_K_CTC ZIMATCOPY_K_CTC #define IMATCOPY_K_RTC ZIMATCOPY_K_RTC -#define GEADD_K ZGEADD_K +#define GEADD_K ZGEADD_K #else @@ -2193,7 +2193,7 @@ #define IMATCOPY_K_CTC CIMATCOPY_K_CTC #define IMATCOPY_K_RTC CIMATCOPY_K_RTC -#define GEADD_K CGEADD_K +#define GEADD_K CGEADD_K #endif #endif @@ -2806,3 +2806,112 @@ typedef struct { #endif #endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define TRTRS_UNU_SINGLE qtrtrs_UNU_single +#define TRTRS_UNN_SINGLE qtrtrs_UNN_single +#define TRTRS_UTU_SINGLE qtrtrs_UTU_single +#define TRTRS_UTN_SINGLE qtrtrs_UTN_single +#define TRTRS_LNU_SINGLE qtrtrs_LNU_single +#define TRTRS_LNN_SINGLE qtrtrs_LNN_single +#define TRTRS_LTU_SINGLE qtrtrs_LTU_single +#define TRTRS_LTN_SINGLE qtrtrs_LTN_single +#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel +#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel +#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel +#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel +#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel +#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel +#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel +#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel + +#elif defined(DOUBLE) +#define TRTRS_UNU_SINGLE dtrtrs_UNU_single +#define TRTRS_UNN_SINGLE dtrtrs_UNN_single +#define TRTRS_UTU_SINGLE dtrtrs_UTU_single +#define TRTRS_UTN_SINGLE dtrtrs_UTN_single +#define TRTRS_LNU_SINGLE dtrtrs_LNU_single +#define TRTRS_LNN_SINGLE dtrtrs_LNN_single +#define TRTRS_LTU_SINGLE dtrtrs_LTU_single +#define TRTRS_LTN_SINGLE dtrtrs_LTN_single +#define 
TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel +#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel +#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel +#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel +#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel +#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel +#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel +#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel +#else +#define TRTRS_UNU_SINGLE strtrs_UNU_single +#define TRTRS_UNN_SINGLE strtrs_UNN_single +#define TRTRS_UTU_SINGLE strtrs_UTU_single +#define TRTRS_UTN_SINGLE strtrs_UTN_single +#define TRTRS_LNU_SINGLE strtrs_LNU_single +#define TRTRS_LNN_SINGLE strtrs_LNN_single +#define TRTRS_LTU_SINGLE strtrs_LTU_single +#define TRTRS_LTN_SINGLE strtrs_LTN_single +#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel +#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel +#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel +#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel +#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel +#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel +#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel +#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel +#endif +#else +#ifdef XDOUBLE +#define TRTRS_UNU_SINGLE xtrtrs_UNU_single +#define TRTRS_UNN_SINGLE xtrtrs_UNN_single +#define TRTRS_UTU_SINGLE xtrtrs_UTU_single +#define TRTRS_UTN_SINGLE xtrtrs_UTN_single +#define TRTRS_LNU_SINGLE xtrtrs_LNU_single +#define TRTRS_LNN_SINGLE xtrtrs_LNN_single +#define TRTRS_LTU_SINGLE xtrtrs_LTU_single +#define TRTRS_LTN_SINGLE xtrtrs_LTN_single +#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel +#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel +#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel +#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel +#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel +#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel +#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel +#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel +#elif defined(DOUBLE) +#define TRTRS_UNU_SINGLE ztrtrs_UNU_single +#define TRTRS_UNN_SINGLE ztrtrs_UNN_single +#define TRTRS_UTU_SINGLE ztrtrs_UTU_single +#define TRTRS_UTN_SINGLE ztrtrs_UTN_single +#define TRTRS_LNU_SINGLE ztrtrs_LNU_single +#define TRTRS_LNN_SINGLE ztrtrs_LNN_single +#define TRTRS_LTU_SINGLE ztrtrs_LTU_single +#define TRTRS_LTN_SINGLE ztrtrs_LTN_single +#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel +#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel +#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel +#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel +#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel +#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel +#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel +#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel +#else +#define TRTRS_UNU_SINGLE ctrtrs_UNU_single +#define TRTRS_UNN_SINGLE ctrtrs_UNN_single +#define TRTRS_UTU_SINGLE ctrtrs_UTU_single +#define TRTRS_UTN_SINGLE ctrtrs_UTN_single +#define TRTRS_LNU_SINGLE ctrtrs_LNU_single +#define TRTRS_LNN_SINGLE ctrtrs_LNN_single +#define TRTRS_LTU_SINGLE ctrtrs_LTU_single +#define TRTRS_LTN_SINGLE ctrtrs_LTN_single +#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel +#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel +#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel +#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel +#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel +#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel +#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel +#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel +#endif +#endif diff --git a/interface/lapack/trtrs.c b/interface/lapack/trtrs.c new file mode 100644 index 000000000..261b07ec6 --- /dev/null +++ 
b/interface/lapack/trtrs.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRTRS" +#elif defined(DOUBLE) +#define ERROR_NAME "DTRTRS" +#else +#define ERROR_NAME "STRTRS" +#endif + +static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE, TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE, TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE, TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE, +}; + +#ifdef SMP +static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL, TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL, TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL, TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL, +}; +#endif + +int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, blasint *Info){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blas_arg_t args; + + blasint info; + int uplo, trans, diag; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *N; + args.n = *NRHS; + args.a = (void *)a; + args.lda = *ldA; + args.b = (void *)b; + args.ldb = *ldB; + + info = 0; + + TOUPPER(trans_arg); + trans = -1; + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + if (args.ldb < MAX(1, args.m)) info = 7; + if (args.lda < MAX(1, args.m)) info = 9; + if (args.n < 0) info = 5; + if (args.m < 0) info = 4; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + if (diag < 0) info = 3; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + args.alpha = NULL; + args.beta = NULL; + + *Info = 0; + + if (args.m == 0 || args.n == 0) return 0; + + if (diag) { + if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { + *Info = IAMIN_K(args.n, args.a, args.lda + 1); + return 0; + } + } + + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n); + + IDEBUG_END; + + return 0; + +} From 733d97b2df64d6e5674a5a6b673254584b1d75af Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 30 Aug 2019 16:31:25 -0400 Subject: [PATCH 698/935] add files --- lapack/trtrs/Makefile | 308 ++++++++++++++++++++++++++++++++++++ lapack/trtrs/trtrs_single.c | 68 ++++++++ 2 files changed, 376 insertions(+) create mode 100644 lapack/trtrs/Makefile create mode 100644 lapack/trtrs/trtrs_single.c diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile new file mode 100644 
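Before the body of the new Makefile, it is worth spelling out how interface/lapack/trtrs.c (added by the previous patch) picks a kernel. The eight real-precision variants sit in the trtrs_single[] and trtrs_parallel[] tables in a fixed UNU, UNN, UTU, UTN, LNU, LNN, LTU, LTN order, and the UPLO/TRANS/DIAG characters are normalized to 0/1 and packed into a three-bit table index with (uplo << 2) | (trans << 1) | diag. What follows is a minimal standalone sketch of that indexing; the names array is illustrative only and is not part of the patch:

    /* Sketch of the 3-bit kernel index used by interface/lapack/trtrs.c.
       The table must stay in exactly this order: (uplo << 2) selects the
       U/L half, (trans << 1) the N/T pair, and diag the unit/non-unit
       entry. */
    #include <stdio.h>

    static const char *names[8] = {
      "trtrs_UNU", "trtrs_UNN", "trtrs_UTU", "trtrs_UTN",
      "trtrs_LNU", "trtrs_LNN", "trtrs_LTU", "trtrs_LTN",
    };

    int main(void) {
      for (int uplo = 0; uplo <= 1; uplo++)        /* 'U' -> 0, 'L' -> 1 */
        for (int trans = 0; trans <= 1; trans++)   /* 'N'/'R' -> 0, 'T'/'C' -> 1 */
          for (int diag = 0; diag <= 1; diag++) {  /* unit -> 0, non-unit -> 1 */
            int idx = (uplo << 2) | (trans << 1) | diag;
            printf("uplo=%d trans=%d diag=%d -> table[%d] = %s\n",
                   uplo, trans, diag, idx, names[idx]);
          }
      return 0;
    }

Note that the interface collapses 'R' onto 'N' and 'C' onto 'T', which is only valid for real data; the conjugated complex cases get their own TRANS encoding in the later patches of this series.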
index 000000000..6f41a9319 --- /dev/null +++ b/lapack/trtrs/Makefile @@ -0,0 +1,308 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = strtrs_UNU_single.$(SUFFIX) strtrs_UNN_single.$(SUFFIX) strtrs_UTU_single.$(SUFFIX) strtrs_UTN_single.$(SUFFIX) strtrs_LNU_single.$(SUFFIX) strtrs_LNN_single.$(SUFFIX) strtrs_LTU_single.$(SUFFIX) strtrs_LTN_single.$(SUFFIX) +DBLASOBJS = dtrtrs_UNU_single.$(SUFFIX) dtrtrs_UNN_single.$(SUFFIX) dtrtrs_UTU_single.$(SUFFIX) dtrtrs_UTN_single.$(SUFFIX) dtrtrs_LNU_single.$(SUFFIX) dtrtrs_LNN_single.$(SUFFIX) dtrtrs_LTU_single.$(SUFFIX) dtrtrs_LTN_single.$(SUFFIX) +QBLASOBJS = qtrtrs_UNU_single.$(SUFFIX) qtrtrs_UNN_single.$(SUFFIX) qtrtrs_UTU_single.$(SUFFIX) qtrtrs_UTN_single.$(SUFFIX) qtrtrs_LNU_single.$(SUFFIX) qtrtrs_LNN_single.$(SUFFIX) qtrtrs_LTU_single.$(SUFFIX) qtrtrs_LTN_single.$(SUFFIX) +CBLASOBJS = cgetrs_N_single.$(SUFFIX) cgetrs_T_single.$(SUFFIX) cgetrs_R_single.$(SUFFIX) cgetrs_C_single.$(SUFFIX) +ZBLASOBJS = zgetrs_N_single.$(SUFFIX) zgetrs_T_single.$(SUFFIX) zgetrs_R_single.$(SUFFIX) zgetrs_C_single.$(SUFFIX) +XBLASOBJS = xgetrs_N_single.$(SUFFIX) xgetrs_T_single.$(SUFFIX) xgetrs_R_single.$(SUFFIX) xgetrs_C_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += sgetrs_N_parallel.$(SUFFIX) sgetrs_T_parallel.$(SUFFIX) +DBLASOBJS += dgetrs_N_parallel.$(SUFFIX) dgetrs_T_parallel.$(SUFFIX) +QBLASOBJS += qgetrs_N_parallel.$(SUFFIX) qgetrs_T_parallel.$(SUFFIX) +CBLASOBJS += cgetrs_N_parallel.$(SUFFIX) cgetrs_T_parallel.$(SUFFIX) cgetrs_R_parallel.$(SUFFIX) cgetrs_C_parallel.$(SUFFIX) +ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_parallel.$(SUFFIX) zgetrs_C_parallel.$(SUFFIX) +XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX) +endif + +strtrs_UNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +strtrs_UNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +strtrs_UTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +strtrs_UTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +strtrs_LNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +strtrs_LNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +strtrs_LTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +strtrs_LTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +strtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +strtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +strtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +strtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +strtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +strtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + 
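These rules all invoke the same template source with a different -D/-U flag set, so one file yields eight objects per precision (the remaining strtrs and dtrtrs rules continue directly below). Here is a condensed, self-contained sketch of how such a template can resolve the flags at compile time; it mirrors the macro dispatch that trtrs_single.c adopts later in this series, with the kernel names reduced to strings and the DIAG axis omitted for brevity:

    /* trtrs_flag_sketch.c -- build several objects from one source, e.g.
     *   cc -c -UUPLO -UTRANS trtrs_flag_sketch.c -o trtrs_UN.o
     *   cc -c -DUPLO -DTRANS trtrs_flag_sketch.c -o trtrs_LT.o
     * Each -D/-U combination bakes a different kernel choice into the
     * resulting object. */
    #include <stdio.h>

    #if !defined(UPLO) && !defined(TRANS)
    #define KERNEL "TRSM_LNU? (upper, no transpose)"
    #elif !defined(UPLO) && defined(TRANS)
    #define KERNEL "TRSM_LTU? (upper, transpose)"
    #elif defined(UPLO) && !defined(TRANS)
    #define KERNEL "TRSM_LNL? (lower, no transpose)"
    #else
    #define KERNEL "TRSM_LTL? (lower, transpose)"
    #endif

    int main(void) {
      printf("this object was compiled to call %s kernels\n", KERNEL);
      return 0;
    }

Note in passing that the LTU rules in this Makefile pass -DDIAG, the same flag set as the LTN rules; the complex LTU rules rewritten later in the series use -UDIAG for the unit-diagonal case.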
+strtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +strtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +dtrtrs_UNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +dtrtrs_UNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +dtrtrs_UTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +dtrtrs_UTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +dtrtrs_LNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +dtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +dtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +dtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +dtrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +dtrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +dtrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +dtrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +dtrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +dtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +dtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +dtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_UNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +qtrtrs_UNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +qtrtrs_UTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +qtrtrs_UTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_LNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +qtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +qtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +qtrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +qtrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX 
-DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +qtrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +qtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +ctrtrs_UNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +ctrtrs_UNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +ctrtrs_UTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +ctrtrs_UTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +ctrtrs_LNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +ctrtrs_LNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +ctrtrs_LTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +ctrtrs_LTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +ctrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +ctrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +ctrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +ctrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +ctrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +ctrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +ctrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +ctrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +ztrtrs_UNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +ztrtrs_UNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +ztrtrs_UTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +ztrtrs_UTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +ztrtrs_LNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +ztrtrs_LNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +ztrtrs_LTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +ztrtrs_LTN_single.$(SUFFIX) : 
trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +ztrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +ztrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +ztrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +ztrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +ztrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +ztrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +ztrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +ztrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_UNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +qtrtrs_UNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +qtrtrs_UTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +qtrtrs_UTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_LNU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +qtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +qtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) + +qtrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) + +qtrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) + +qtrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) + +qtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) + +qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/trtrs/trtrs_single.c b/lapack/trtrs/trtrs_single.c new file mode 100644 index 000000000..0dbb03869 --- /dev/null +++ b/lapack/trtrs/trtrs_single.c @@ -0,0 +1,68 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/*                                                                   */ +/* Redistribution and use in source and binary forms, with or       */ +/* without modification, are permitted provided that the following  */ +/* conditions are met:                                               */ +/*                                                                   */ +/*   1. Redistributions of source code must retain the above        */ +/*      copyright notice, this list of conditions and the following */ +/*      disclaimer.                                                  */ +/*                                                                   */ +/*   2. Redistributions in binary form must reproduce the above     */ +/*      copyright notice, this list of conditions and the following */ +/*      disclaimer in the documentation and/or other materials      */ +/*      provided with the distribution.                              */ +/*                                                                   */ +/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ +/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ +/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ +/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ +/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ +/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ +/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ +/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ +/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ +/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ +/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ +/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ +/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ +/*    POSSIBILITY OF SUCH DAMAGE.                                    */ +/*                                                                   */ +/* The views and conclusions contained in the software and          */ +/* documentation are those of the authors and should not be         */ +/* interpreted as representing official policies, either expressed  */ +/* or implied, of The University of Texas at Austin.                */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + +#ifndef TRANS + LASWP_PLUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + TRSM_LNUN (args, range_m, range_n, sa, sb, 0); + } + +#else + + if (args -> n == 1){ + TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LTUN (args, range_m, range_n, sa, sb, 0); + TRSM_LTLU (args, range_m, range_n, sa, sb, 0); + } + + LASWP_MINUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#endif + + return 0; } From a4f17a9297c444180be4f2a73cc2940ddda7b00f Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Mon, 2 Sep 2019 21:15:20 -0400 Subject: [PATCH 699/935] add missing objects --- lapack/Makefile | 2 +- lapack/trtrs/Makefile | 210 +++++++++++++++++++++--------------------- 2 files changed, 106 insertions(+), 106 deletions(-) diff --git a/lapack/Makefile b/lapack/Makefile index aff5209d5..2bbb4603f 100644 --- a/lapack/Makefile +++ b/lapack/Makefile @@ -2,7 +2,7 @@ TOPDIR = ..
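The hunk that continues below makes the one-line change registering trtrs in SUBDIRS; the rest of the patch then replaces object lists that had been copied over from getrs. For the complex precisions the corrected lists carry sixteen entries each, because complex TRANS distinguishes four cases (N, T, R, C, the last two being the conjugated forms) instead of the two real ones. A throwaway sketch that enumerates the resulting naming scheme; the loop itself is illustrative, only the generated names appear in the Makefile:

    /* Enumerate the 2 (uplo) x 4 (trans) x 2 (diag) = 16 complex trtrs
       variants named in the object lists below, in Makefile order. */
    #include <stdio.h>

    int main(void) {
      const char uplo[]  = "UL";
      const char trans[] = "NTRC";   /* R/C = conjugated (no-)transpose */
      const char diag[]  = "UN";
      int count = 0;
      for (int u = 0; u < 2; u++)
        for (int t = 0; t < 4; t++)
          for (int d = 0; d < 2; d++) {
            count++;
            printf("%2d: ctrtrs_%c%c%c_single.o\n",
                   count, uplo[u], trans[t], diag[d]);
          }
      return 0;   /* prints the 16 combinations */
    }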
include ../Makefile.system #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs -SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri +SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile index 6f41a9319..400b8b653 100644 --- a/lapack/trtrs/Makefile +++ b/lapack/trtrs/Makefile @@ -4,17 +4,17 @@ include ../../Makefile.system SBLASOBJS = strtrs_UNU_single.$(SUFFIX) strtrs_UNN_single.$(SUFFIX) strtrs_UTU_single.$(SUFFIX) strtrs_UTN_single.$(SUFFIX) strtrs_LNU_single.$(SUFFIX) strtrs_LNN_single.$(SUFFIX) strtrs_LTU_single.$(SUFFIX) strtrs_LTN_single.$(SUFFIX) DBLASOBJS = dtrtrs_UNU_single.$(SUFFIX) dtrtrs_UNN_single.$(SUFFIX) dtrtrs_UTU_single.$(SUFFIX) dtrtrs_UTN_single.$(SUFFIX) dtrtrs_LNU_single.$(SUFFIX) dtrtrs_LNN_single.$(SUFFIX) dtrtrs_LTU_single.$(SUFFIX) dtrtrs_LTN_single.$(SUFFIX) QBLASOBJS = qtrtrs_UNU_single.$(SUFFIX) qtrtrs_UNN_single.$(SUFFIX) qtrtrs_UTU_single.$(SUFFIX) qtrtrs_UTN_single.$(SUFFIX) qtrtrs_LNU_single.$(SUFFIX) qtrtrs_LNN_single.$(SUFFIX) qtrtrs_LTU_single.$(SUFFIX) qtrtrs_LTN_single.$(SUFFIX) -CBLASOBJS = cgetrs_N_single.$(SUFFIX) cgetrs_T_single.$(SUFFIX) cgetrs_R_single.$(SUFFIX) cgetrs_C_single.$(SUFFIX) -ZBLASOBJS = zgetrs_N_single.$(SUFFIX) zgetrs_T_single.$(SUFFIX) zgetrs_R_single.$(SUFFIX) zgetrs_C_single.$(SUFFIX) -XBLASOBJS = xgetrs_N_single.$(SUFFIX) xgetrs_T_single.$(SUFFIX) xgetrs_R_single.$(SUFFIX) xgetrs_C_single.$(SUFFIX) +CBLASOBJS = ctrtrs_UNU_single.$(SUFFIX) ctrtrs_UNN_single.$(SUFFIX) ctrtrs_UTU_single.$(SUFFIX) ctrtrs_UTN_single.$(SUFFIX) ctrtrs_URU_single.$(SUFFIX) ctrtrs_URN_single.$(SUFFIX) ctrtrs_UCU_single.$(SUFFIX) ctrtrs_UCN_single.$(SUFFIX) ctrtrs_LNU_single.$(SUFFIX) ctrtrs_LNN_single.$(SUFFIX) ctrtrs_LTU_single.$(SUFFIX) ctrtrs_LTN_single.$(SUFFIX) ctrtrs_LRU_single.$(SUFFIX) ctrtrs_LRN_single.$(SUFFIX) ctrtrs_LCU_single.$(SUFFIX) ctrtrs_LCN_single.$(SUFFIX) +ZBLASOBJS = ztrtrs_UNU_single.$(SUFFIX) ztrtrs_UNN_single.$(SUFFIX) ztrtrs_UTU_single.$(SUFFIX) ztrtrs_UTN_single.$(SUFFIX) ztrtrs_URU_single.$(SUFFIX) ztrtrs_URN_single.$(SUFFIX) ztrtrs_UCU_single.$(SUFFIX) ztrtrs_UCN_single.$(SUFFIX) ztrtrs_LNU_single.$(SUFFIX) ztrtrs_LNN_single.$(SUFFIX) ztrtrs_LTU_single.$(SUFFIX) ztrtrs_LTN_single.$(SUFFIX) ztrtrs_LRU_single.$(SUFFIX) ztrtrs_LRN_single.$(SUFFIX) ztrtrs_LCU_single.$(SUFFIX) ztrtrs_LCN_single.$(SUFFIX) +XBLASOBJS = xtrtrs_UNU_single.$(SUFFIX) xtrtrs_UNN_single.$(SUFFIX) xtrtrs_UTU_single.$(SUFFIX) xtrtrs_UTN_single.$(SUFFIX) xtrtrs_URU_single.$(SUFFIX) xtrtrs_URN_single.$(SUFFIX) xtrtrs_UCU_single.$(SUFFIX) xtrtrs_UCN_single.$(SUFFIX) xtrtrs_LNU_single.$(SUFFIX) xtrtrs_LNN_single.$(SUFFIX) xtrtrs_LTU_single.$(SUFFIX) xtrtrs_LTN_single.$(SUFFIX) xtrtrs_LRU_single.$(SUFFIX) xtrtrs_LRN_single.$(SUFFIX) xtrtrs_LCU_single.$(SUFFIX) xtrtrs_LCN_single.$(SUFFIX) ifdef SMP -SBLASOBJS += sgetrs_N_parallel.$(SUFFIX) sgetrs_T_parallel.$(SUFFIX) -DBLASOBJS += dgetrs_N_parallel.$(SUFFIX) dgetrs_T_parallel.$(SUFFIX) -QBLASOBJS += qgetrs_N_parallel.$(SUFFIX) qgetrs_T_parallel.$(SUFFIX) -CBLASOBJS += cgetrs_N_parallel.$(SUFFIX) cgetrs_T_parallel.$(SUFFIX) cgetrs_R_parallel.$(SUFFIX) cgetrs_C_parallel.$(SUFFIX) -ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_parallel.$(SUFFIX) zgetrs_C_parallel.$(SUFFIX) -XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX) +SBLASOBJS += 
strtrs_UNU_parallel.$(SUFFIX) strtrs_UNN_parallel.$(SUFFIX) strtrs_UTU_parallel.$(SUFFIX) strtrs_UTN_parallel.$(SUFFIX) strtrs_LNU_parallel.$(SUFFIX) strtrs_LNN_parallel.$(SUFFIX) strtrs_LTU_parallel.$(SUFFIX) strtrs_LTN_parallel.$(SUFFIX) +DBLASOBJS += dtrtrs_UNU_parallel.$(SUFFIX) dtrtrs_UNN_parallel.$(SUFFIX) dtrtrs_UTU_parallel.$(SUFFIX) dtrtrs_UTN_parallel.$(SUFFIX) dtrtrs_LNU_parallel.$(SUFFIX) dtrtrs_LNN_parallel.$(SUFFIX) dtrtrs_LTU_parallel.$(SUFFIX) dtrtrs_LTN_parallel.$(SUFFIX) +QBLASOBJS += qtrtrs_UNU_parallel.$(SUFFIX) qtrtrs_UNN_parallel.$(SUFFIX) qtrtrs_UTU_parallel.$(SUFFIX) qtrtrs_UTN_parallel.$(SUFFIX) qtrtrs_LNU_parallel.$(SUFFIX) qtrtrs_LNN_parallel.$(SUFFIX) qtrtrs_LTU_parallel.$(SUFFIX) qtrtrs_LTN_parallel.$(SUFFIX) +CBLASOBJS += ctrtrs_UNU_parallel.$(SUFFIX) ctrtrs_UNN_parallel.$(SUFFIX) ctrtrs_UTU_parallel.$(SUFFIX) ctrtrs_UTN_parallel.$(SUFFIX) ctrtrs_URU_parallel.$(SUFFIX) ctrtrs_URN_parallel.$(SUFFIX) ctrtrs_UCU_parallel.$(SUFFIX) ctrtrs_UCN_parallel.$(SUFFIX) ctrtrs_LNU_parallel.$(SUFFIX) ctrtrs_LNN_parallel.$(SUFFIX) ctrtrs_LTU_parallel.$(SUFFIX) ctrtrs_LTN_parallel.$(SUFFIX) ctrtrs_LRU_parallel.$(SUFFIX) ctrtrs_LRN_parallel.$(SUFFIX) ctrtrs_LCU_parallel.$(SUFFIX) ctrtrs_LCN_parallel.$(SUFFIX) +ZBLASOBJS += ztrtrs_UNU_parallel.$(SUFFIX) ztrtrs_UNN_parallel.$(SUFFIX) ztrtrs_UTU_parallel.$(SUFFIX) ztrtrs_UTN_parallel.$(SUFFIX) ztrtrs_URU_parallel.$(SUFFIX) ztrtrs_URN_parallel.$(SUFFIX) ztrtrs_UCU_parallel.$(SUFFIX) ztrtrs_UCN_parallel.$(SUFFIX) ztrtrs_LNU_parallel.$(SUFFIX) ztrtrs_LNN_parallel.$(SUFFIX) ztrtrs_LTU_parallel.$(SUFFIX) ztrtrs_LTN_parallel.$(SUFFIX) ztrtrs_LRU_parallel.$(SUFFIX) ztrtrs_LRN_parallel.$(SUFFIX) ztrtrs_LCU_parallel.$(SUFFIX) ztrtrs_LCN_parallel.$(SUFFIX) +XBLASOBJS += xtrtrs_UNU_parallel.$(SUFFIX) xtrtrs_UNN_parallel.$(SUFFIX) xtrtrs_UTU_parallel.$(SUFFIX) xtrtrs_UTN_parallel.$(SUFFIX) xtrtrs_URU_parallel.$(SUFFIX) xtrtrs_URN_parallel.$(SUFFIX) xtrtrs_UCU_parallel.$(SUFFIX) xtrtrs_UCN_parallel.$(SUFFIX) xtrtrs_LNU_parallel.$(SUFFIX) xtrtrs_LNN_parallel.$(SUFFIX) xtrtrs_LTU_parallel.$(SUFFIX) xtrtrs_LTN_parallel.$(SUFFIX) xtrtrs_LRU_parallel.$(SUFFIX) xtrtrs_LRN_parallel.$(SUFFIX) xtrtrs_LCU_parallel.$(SUFFIX) xtrtrs_LCN_parallel.$(SUFFIX) endif strtrs_UNU_single.$(SUFFIX) : trtrs_single.c @@ -161,148 +161,148 @@ qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) -ctrtrs_UNU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) +ctrtrs_UNU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F) -ctrtrs_UNN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) +ctrtrs_UNN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F) -ctrtrs_UTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) +ctrtrs_UTU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F) -ctrtrs_UTN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) +ctrtrs_UTN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F) -ctrtrs_LNU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS 
-UDIAG $< -o $(@F) +ctrtrs_URU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F) -ctrtrs_LNN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) +ctrtrs_URN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F) -ctrtrs_LTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +ctrtrs_UCU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F) -ctrtrs_LTN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +ctrtrs_UCN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F) -ctrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) +ctrtrs_LNU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F) -ctrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) +ctrtrs_LNN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F) -ctrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) +ctrtrs_LTU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F) -ctrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) +ctrtrs_LTN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F) -ctrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) +ctrtrs_LRU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F) -ctrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) +ctrtrs_LRN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F) -ctrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +ctrtrs_LCU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F) -ctrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +ctrtrs_LCN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) -ztrtrs_UNU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) +ztrtrs_UNU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F) -ztrtrs_UNN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) +ztrtrs_UNN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F) -ztrtrs_UTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) +ztrtrs_UTU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO 
-DTRANS=2 -UDIAG $< -o $(@F) -ztrtrs_UTN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) +ztrtrs_UTN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F) -ztrtrs_LNU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) +ztrtrs_URU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F) -ztrtrs_LNN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) +ztrtrs_URN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F) -ztrtrs_LTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +ztrtrs_UCU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F) -ztrtrs_LTN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +ztrtrs_UCN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F) -ztrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) +ztrtrs_LNU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F) -ztrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) +ztrtrs_LNN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F) -ztrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) +ztrtrs_LTU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F) -ztrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) +ztrtrs_LTN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F) -ztrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) +ztrtrs_LRU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F) -ztrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) +ztrtrs_LRN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F) -ztrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +ztrtrs_LCU_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F) -ztrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +ztrtrs_LCN_single.$(SUFFIX) : ztrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) -qtrtrs_UNU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) +xtrtrs_UNU_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F) -qtrtrs_UNN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE 
-UUPLO -UTRANS -DDIAG $< -o $(@F) +xtrtrs_UNN_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F) -qtrtrs_UTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) +xtrtrs_UTU_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F) -qtrtrs_UTN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) +xtrtrs_UTN_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F) -qtrtrs_LNU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) +xtrtrs_URU_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F) -qtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) +xtrtrs_URN_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F) -qtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +xtrtrs_UCU_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F) -qtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +xtrtrs_UCN_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F) -qtrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) +xtrtrs_LNU_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F) -qtrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F) +xtrtrs_LNN_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F) -qtrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F) +xtrtrs_LTU_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F) -qtrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F) +xtrtrs_LTN_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F) -qtrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F) +xtrtrs_LRU_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F) -qtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) +xtrtrs_LRN_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F) -qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +xtrtrs_LCU_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F) -qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) +xtrtrs_LCN_single.$(SUFFIX) : xtrtrs_single.c + $(CC) -c 
$(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) include ../../Makefile.tail From 42203dafdcb8e2ab1bd9d68ede5ed78769e2b7a1 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Mon, 2 Sep 2019 21:57:28 -0400 Subject: [PATCH 700/935] add logic --- lapack/trtrs/trtrs_single.c | 79 ++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/lapack/trtrs/trtrs_single.c b/lapack/trtrs/trtrs_single.c index 0dbb03869..a690d4a25 100644 --- a/lapack/trtrs/trtrs_single.c +++ b/lapack/trtrs/trtrs_single.c @@ -41,28 +41,63 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { -#ifndef TRANS - LASWP_PLUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); - - if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); - TRSM_LNUN (args, range_m, range_n, sa, sb, 0); - } - +#ifndef UPLO +#ifndef DIAG +#ifndef DIAG + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + } #else - - if (args -> n == 1){ - TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); - TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LTUN (args, range_m, range_n, sa, sb, 0); - TRSM_LTLU (args, range_m, range_n, sa, sb, 0); - } - - LASWP_MINUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + } +#endif +#else +#ifndef DIAG + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + } +#else + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + } +#endif +#else +#ifndef DIAG +#ifndef DIAG + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + } +#else + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + } +#endif +#else +#ifndef DIAG + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + } +#else + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + } +#endif #endif - return 0; } From 9f6984fe4bd5ca2c44df01c647462f54c7b84762 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Tue, 3 Sep 2019 14:45:43 -0400 Subject: [PATCH 701/935] add missing files --- lapack/trtrs/Makefile | 144 +++++++++++++++++++++++++++++++++ lapack/trtrs/trtrs_parallel.c | 111 +++++++++++++++++++++++++ lapack/trtrs/trtrs_single.c | 84 +++++++------------ lapack/trtrs/ztrtrs_parallel.c | 118 +++++++++++++++++++++++++++ lapack/trtrs/ztrtrs_single.c | 98 ++++++++++++++++++++++ 5 files changed, 499 insertions(+), 56 deletions(-) create mode 100644 lapack/trtrs/trtrs_parallel.c create mode 100644 lapack/trtrs/ztrtrs_parallel.c create mode 100644 lapack/trtrs/ztrtrs_single.c diff --git 
a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile index 400b8b653..f9faaf9b9 100644 --- a/lapack/trtrs/Makefile +++ b/lapack/trtrs/Makefile @@ -305,4 +305,148 @@ xtrtrs_LCU_single.$(SUFFIX) : xtrtrs_single.c xtrtrs_LCN_single.$(SUFFIX) : xtrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) +ctrtrs_UNU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F) + +ctrtrs_UNN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F) + +ctrtrs_UTU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F) + +ctrtrs_UTN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F) + +ctrtrs_URU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F) + +ctrtrs_URN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F) + +ctrtrs_UCU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F) + +ctrtrs_UCN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F) + +ctrtrs_LNU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F) + +ctrtrs_LNN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F) + +ctrtrs_LTU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F) + +ctrtrs_LTN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F) + +ctrtrs_LRU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F) + +ctrtrs_LRN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F) + +ctrtrs_LCU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F) + +ctrtrs_LCN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) + +ztrtrs_UNU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F) + +ztrtrs_UNN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F) + +ztrtrs_UTU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F) + +ztrtrs_UTN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F) + +ztrtrs_URU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F) + +ztrtrs_URN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F) + +ztrtrs_UCU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F) + +ztrtrs_UCN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F) + +ztrtrs_LNU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F) + 
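Note the numeric encoding these complex rules use: instead of a bare -DTRANS/-UTRANS, the Makefile passes -DTRANS=1 through -DTRANS=4, and the ztrtrs templates decode the number into the N, T, R or C kernel family (the remaining ztrtrs and xtrtrs parallel rules continue below). A condensed, compilable mirror of that decoding follows; only the TRANS axis is shown, whereas the real templates also key on UPLO and DIAG:

    /* ztrtrs_trans_sketch.c -- compile with -DTRANS=1..4 (defaults to 1
       so the sketch builds standalone). Mirrors the #if TRANS == n
       chain in ztrtrs_single.c / ztrtrs_parallel.c. */
    #include <stdio.h>

    #ifndef TRANS
    #define TRANS 1
    #endif

    #if   TRANS == 1
    #define SUFFIX "N (no transpose)"
    #elif TRANS == 2
    #define SUFFIX "T (transpose)"
    #elif TRANS == 3
    #define SUFFIX "R (conjugate, no transpose)"
    #elif TRANS == 4
    #define SUFFIX "C (conjugate transpose)"
    #else
    #error "TRANS must be 1, 2, 3 or 4"
    #endif

    int main(void) {
      printf("TRANS=%d selects the %s kernel family\n", TRANS, SUFFIX);
      return 0;
    }

The switch to numeric values is what lets a single template cover the conjugated variants that a plain defined/undefined flag cannot express.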
+ztrtrs_LNN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F) + +ztrtrs_LTU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F) + +ztrtrs_LTN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F) + +ztrtrs_LRU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F) + +ztrtrs_LRN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F) + +ztrtrs_LCU_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F) + +ztrtrs_LCN_parallel.$(SUFFIX) : ztrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) + +xtrtrs_UNU_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F) + +xtrtrs_UNN_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F) + +xtrtrs_UTU_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F) + +xtrtrs_UTN_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F) + +xtrtrs_URU_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F) + +xtrtrs_URN_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F) + +xtrtrs_UCU_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F) + +xtrtrs_UCN_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F) + +xtrtrs_LNU_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F) + +xtrtrs_LNN_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F) + +xtrtrs_LTU_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F) + +xtrtrs_LTN_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F) + +xtrtrs_LRU_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F) + +xtrtrs_LRN_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F) + +xtrtrs_LCU_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F) + +xtrtrs_LCN_parallel.$(SUFFIX) : xtrtrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) + include ../../Makefile.tail diff --git a/lapack/trtrs/trtrs_parallel.c b/lapack/trtrs/trtrs_parallel.c new file mode 100644 index 000000000..52f42f693 --- /dev/null +++ b/lapack/trtrs/trtrs_parallel.c @@ -0,0 +1,111 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#if !defined(TRANS) && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LNUU +#define TRSV TRSV_NUU +#elif !defined(TRANS) && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LNUN +#define TRSV TRSV_NUN +#elif !defined(TRANS) && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LNLU +#define TRSV TRSV_NLU +#elif !defined(TRANS) && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LNLN +#define TRSV TRSV_NLN +#elif defined(TRANS) && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LTUU +#define TRSV TRSV_TUU +#elif defined(TRANS) && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LTUN +#define TRSV TRSV_TUN +#elif defined(TRANS) && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LTLU +#define TRSV TRSV_TLU +#elif defined(TRANS) && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LTLN +#define TRSV TRSV_TLN +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + TRSM (args, range_m, range_n, sa, sb, 0); + + return 0; +} + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + int mode; + +#ifndef TRANS + if (args -> n == 1){ + TRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); + } +#else + if (args -> n == 1){ + TRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#else + mode = BLAS_SINGLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> 
nthreads); + } +#endif + + return 0; + } diff --git a/lapack/trtrs/trtrs_single.c b/lapack/trtrs/trtrs_single.c index a690d4a25..c82b81303 100644 --- a/lapack/trtrs/trtrs_single.c +++ b/lapack/trtrs/trtrs_single.c @@ -39,65 +39,37 @@ #include #include "common.h" +#if !defined(TRANS) && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LNUU +#define TRSV TRSV_NUU +#elif !defined(TRANS) && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LNUN +#define TRSV TRSV_NUN +#elif !defined(TRANS) && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LNLU +#define TRSV TRSV_NLU +#elif !defined(TRANS) && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LNLN +#define TRSV TRSV_NLN +#elif defined(TRANS) && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LTUU +#define TRSV TRSV_TUU +#elif defined(TRANS) && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LTUN +#define TRSV TRSV_TUN +#elif defined(TRANS) && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LTLU +#define TRSV TRSV_TLU +#elif defined(TRANS) && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LTLN +#define TRSV TRSV_TLN +#endif + blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { -#ifndef UPLO -#ifndef DIAG -#ifndef DIAG - if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); - } -#else - if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); - } -#endif -#else -#ifndef DIAG - if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); - } -#else - if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); - } -#endif -#else -#ifndef DIAG -#ifndef DIAG - if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); - } -#else - if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); - } -#endif -#else -#ifndef DIAG if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb); } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + TRSM (args, range_m, range_n, sa, sb, 0); } -#else - if (args -> n == 1){ - TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); - } else { - TRSM_LNLU (args, range_m, range_n, sa, sb, 0); - } -#endif -#endif return 0; } diff --git a/lapack/trtrs/ztrtrs_parallel.c b/lapack/trtrs/ztrtrs_parallel.c new file mode 100644 index 000000000..d5248f21b --- /dev/null +++ b/lapack/trtrs/ztrtrs_parallel.c @@ -0,0 +1,118 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#if TRANS == 1 && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LNUU +#define ZTRSV ZTRSV_NUU +#elif TRANS == 1 && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LNUN +#define ZTRSV ZTRSV_NUN +#elif TRANS == 1 && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LNLU +#define ZTRSV ZTRSV_NLU +#elif TRANS == 1 && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LNLN +#define ZTRSV ZTRSV_NLN +#elif TRANS == 2 && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LTUU +#define ZTRSV ZTRSV_TUU +#elif TRANS == 2 && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LTUN +#define ZTRSV ZTRSV_TUN +#elif TRANS == 2 && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LTLU +#define ZTRSV ZTRSV_TLU +#elif TRANS == 2 && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LTLN +#define ZTRSV ZTRSV_TLN +#elif TRANS == 3 && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LRUU +#define ZTRSV ZTRSV_RUU +#elif TRANS == 3 && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LRUN +#define ZTRSV ZTRSV_RUN +#elif TRANS == 3 && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LRLU +#define ZTRSV ZTRSV_RLU +#elif TRANS == 3 && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LRLN +#define ZTRSV ZTRSV_RLN +#elif TRANS == 4 && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LCUU +#define ZTRSV ZTRSV_CUU +#elif TRANS == 4 && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LCUN +#define ZTRSV ZTRSV_CUN +#elif TRANS == 4 && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LCLU +#define ZTRSV ZTRSV_CLU +#elif TRANS == 4 && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LCLN +#define ZTRSV ZTRSV_CLN +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + TRSM (args, range_m, range_n, sa, sb, 0); + return 0; +} + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + int mode; + + if (args -> n == 1){ + ZTRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { +#ifdef XDOUBLE + mode = 
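/* Note: only precision and BLAS_COMPLEX go into mode here; unlike the
   real-precision trtrs_parallel.c earlier, no BLAS_TRANSA bit is set,
   the TRANS=1..4 macro block above having already selected the kernel
   at compile time. */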
BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); + } + + return 0; + } diff --git a/lapack/trtrs/ztrtrs_single.c b/lapack/trtrs/ztrtrs_single.c new file mode 100644 index 000000000..f39d72900 --- /dev/null +++ b/lapack/trtrs/ztrtrs_single.c @@ -0,0 +1,98 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#if TRANS == 1 && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LNUU +#define ZTRSV ZTRSV_NUU +#elif TRANS == 1 && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LNUN +#define ZTRSV ZTRSV_NUN +#elif TRANS == 1 && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LNLU +#define ZTRSV ZTRSV_NLU +#elif TRANS == 1 && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LNLN +#define ZTRSV ZTRSV_NLN +#elif TRANS == 2 && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LTUU +#define ZTRSV ZTRSV_TUU +#elif TRANS == 2 && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LTUN +#define ZTRSV ZTRSV_TUN +#elif TRANS == 2 && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LTLU +#define ZTRSV ZTRSV_TLU +#elif TRANS == 2 && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LTLN +#define ZTRSV ZTRSV_TLN +#elif TRANS == 3 && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LRUU +#define ZTRSV ZTRSV_RUU +#elif TRANS == 3 && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LRUN +#define ZTRSV ZTRSV_RUN +#elif TRANS == 3 && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LRLU +#define ZTRSV ZTRSV_RLU +#elif TRANS == 3 && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LRLN +#define ZTRSV ZTRSV_RLN +#elif TRANS == 4 && !defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LCUU +#define ZTRSV ZTRSV_CUU +#elif TRANS == 4 && !defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LCUN +#define ZTRSV ZTRSV_CUN +#elif TRANS == 4 && defined(UPLO) && !defined(DIAG) +#define TRSM TRSM_LCLU +#define ZTRSV ZTRSV_CLU +#elif TRANS == 4 && defined(UPLO) && defined(DIAG) +#define TRSM TRSM_LCLN +#define ZTRSV ZTRSV_CLN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + if (args -> n == 1){ + ZTRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM (args, range_m, range_n, sa, sb, 0); + } + return 0; } From 9b2f0323d6ecc36b16038c6a806a4e3106245d5a Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 6 Sep 2019 16:01:55 -0400 Subject: [PATCH 702/935] update Makefile --- interface/Makefile | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index f0577796d..2edf6387a 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -394,7 +394,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) SLAPACKOBJS = \ sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ - slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) + slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) #DLAPACKOBJS = \ @@ -405,14 +405,14 @@ SLAPACKOBJS = \ DLAPACKOBJS = \ dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ - dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) + dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) QLAPACKOBJS = \ qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \ - qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ - + qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ + qtrtrs.$(SUFFIX) #CLAPACKOBJS = \ # cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ @@ -423,7 +423,7 @@ 
QLAPACKOBJS = \ CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) #ZLAPACKOBJS = \ @@ -435,13 +435,14 @@ CLAPACKOBJS = \ ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \ - xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \ + xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \ + xtrtrs.$(SUFFIX) ifneq ($(NO_LAPACK), 1) SBLASOBJS += $(SLAPACKOBJS) @@ -2043,6 +2044,24 @@ zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) +strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : trtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : ztrtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) From c7b5a459b6191ceb5ad244b3ff2d059a9751375e Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 6 Sep 2019 16:48:18 -0400 Subject: [PATCH 703/935] add missing defines and headers --- common_lapack.h | 146 ++++++++++++++++++++++++++++++++++++++++++++++++ common_macro.h | 48 ++++++++++++++++ 2 files changed, 194 insertions(+) diff --git a/common_lapack.h b/common_lapack.h index f6d1956fc..f9c36646a 100644 --- a/common_lapack.h +++ b/common_lapack.h @@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint 
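/* Naming scheme for the prototypes in this block:
   {s,d,q,c,z,x}trtrs_{U|L}{N|T|R|C}{U|N}_{single|parallel} encodes
   precision, upper/lower triangle, the transpose variant (R and C exist
   only for the complex types), unit/non-unit diagonal, and the serial
   vs. threaded driver. */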
dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint 
ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG 
*, double *, double *, BLASLONG); +blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint 
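/* The _parallel variants share the _single signatures above; they
   differ only in routing the solve through gemm_thread_n when more
   than one thread is available. */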
ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + #endif diff --git a/common_macro.h b/common_macro.h index e8a4a66ed..13bb85794 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2867,51 +2867,99 @@ typedef struct { #define TRTRS_UNN_SINGLE xtrtrs_UNN_single #define TRTRS_UTU_SINGLE xtrtrs_UTU_single #define TRTRS_UTN_SINGLE xtrtrs_UTN_single +#define TRTRS_URU_SINGLE xtrtrs_URU_single +#define TRTRS_URN_SINGLE xtrtrs_URN_single +#define TRTRS_UCU_SINGLE xtrtrs_UCU_single +#define TRTRS_UCN_SINGLE xtrtrs_UCN_single #define TRTRS_LNU_SINGLE xtrtrs_LNU_single #define TRTRS_LNN_SINGLE xtrtrs_LNN_single #define TRTRS_LTU_SINGLE xtrtrs_LTU_single #define TRTRS_LTN_SINGLE xtrtrs_LTN_single +#define TRTRS_LRU_SINGLE xtrtrs_LRU_single +#define TRTRS_LRN_SINGLE xtrtrs_LRN_single +#define TRTRS_LCU_SINGLE xtrtrs_LCU_single +#define TRTRS_LCN_SINGLE xtrtrs_LCN_single #define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel #define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel #define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel #define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel +#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel +#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel +#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel +#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel #define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel #define TRTRS_LNN_PARALLEL 
xtrtrs_LNN_parallel #define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel #define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel +#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel +#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel +#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel +#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel #elif defined(DOUBLE) #define TRTRS_UNU_SINGLE ztrtrs_UNU_single #define TRTRS_UNN_SINGLE ztrtrs_UNN_single #define TRTRS_UTU_SINGLE ztrtrs_UTU_single #define TRTRS_UTN_SINGLE ztrtrs_UTN_single +#define TRTRS_URU_SINGLE ztrtrs_URU_single +#define TRTRS_URN_SINGLE ztrtrs_URN_single +#define TRTRS_UCU_SINGLE ztrtrs_UCU_single +#define TRTRS_UCN_SINGLE ztrtrs_UCN_single #define TRTRS_LNU_SINGLE ztrtrs_LNU_single #define TRTRS_LNN_SINGLE ztrtrs_LNN_single #define TRTRS_LTU_SINGLE ztrtrs_LTU_single #define TRTRS_LTN_SINGLE ztrtrs_LTN_single +#define TRTRS_LRU_SINGLE ztrtrs_LRU_single +#define TRTRS_LRN_SINGLE ztrtrs_LRN_single +#define TRTRS_LCU_SINGLE ztrtrs_LCU_single +#define TRTRS_LCN_SINGLE ztrtrs_LCN_single #define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel #define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel #define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel #define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel +#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel +#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel +#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel +#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel #define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel #define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel #define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel #define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel +#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel +#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel +#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel +#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel #else #define TRTRS_UNU_SINGLE ctrtrs_UNU_single #define TRTRS_UNN_SINGLE ctrtrs_UNN_single #define TRTRS_UTU_SINGLE ctrtrs_UTU_single #define TRTRS_UTN_SINGLE ctrtrs_UTN_single +#define TRTRS_URU_SINGLE ctrtrs_URU_single +#define TRTRS_URN_SINGLE ctrtrs_URN_single +#define TRTRS_UCU_SINGLE ctrtrs_UCU_single +#define TRTRS_UCN_SINGLE ctrtrs_UCN_single #define TRTRS_LNU_SINGLE ctrtrs_LNU_single #define TRTRS_LNN_SINGLE ctrtrs_LNN_single #define TRTRS_LTU_SINGLE ctrtrs_LTU_single #define TRTRS_LTN_SINGLE ctrtrs_LTN_single +#define TRTRS_LRU_SINGLE ctrtrs_LRU_single +#define TRTRS_LRN_SINGLE ctrtrs_LRN_single +#define TRTRS_LCU_SINGLE ctrtrs_LCU_single +#define TRTRS_LCN_SINGLE ctrtrs_LCN_single #define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel #define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel #define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel #define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel +#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel +#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel +#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel +#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel #define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel #define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel #define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel #define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel +#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel +#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel +#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel +#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel #endif #endif From af9ac0898af4357cf66d34215d9711df64f6e858 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 6 Sep 2019 16:49:12 -0400 Subject: [PATCH 704/935] fix Makefile --- interface/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/Makefile 
b/interface/Makefile index 2edf6387a..3f0dcca28 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -2032,7 +2032,7 @@ sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c $(CC) -c $(CFLAGS) $< -o $(@F) -qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c +qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c $(CC) -c $(CFLAGS) $< -o $(@F) cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c @@ -2041,7 +2041,7 @@ cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) -xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c +xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c @@ -2050,7 +2050,7 @@ strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c $(CC) -c $(CFLAGS) $< -o $(@F) -qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : trtrs.c +qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c $(CC) -c $(CFLAGS) $< -o $(@F) ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c @@ -2059,7 +2059,7 @@ ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c $(CC) -c $(CFLAGS) $< -o $(@F) -xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : ztrtrs.c +xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c $(CC) -c $(CFLAGS) $< -o $(@F) sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c From 7ec7b999a543a5480db1f76dcb413deee6c50e2d Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 6 Sep 2019 16:49:27 -0400 Subject: [PATCH 705/935] add missing file --- interface/lapack/ztrtrs.c | 171 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 interface/lapack/ztrtrs.c diff --git a/interface/lapack/ztrtrs.c b/interface/lapack/ztrtrs.c new file mode 100644 index 000000000..4cd423069 --- /dev/null +++ b/interface/lapack/ztrtrs.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRTRS" +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRTRS" +#else +#define ERROR_NAME "CTRTRS" +#endif + +static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE, TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE, TRTRS_URU_SINGLE, TRTRS_URN_SINGLE, TRTRS_UCU_SINGLE, TRTRS_UCN_SINGLE, TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE, TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE, TRTRS_LRU_SINGLE, TRTRS_LRN_SINGLE, TRTRS_LCU_SINGLE, TRTRS_LCN_SINGLE, +}; + +#ifdef SMP +static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL, TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL, TRTRS_URU_PARALLEL, TRTRS_URN_PARALLEL, TRTRS_UCU_PARALLEL, TRTRS_UCN_PARALLEL, TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL, TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL, TRTRS_LRU_PARALLEL, TRTRS_LRN_PARALLEL, TRTRS_LCU_PARALLEL, TRTRS_LCN_PARALLEL, +}; +#endif + +int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, blasint *Info){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blas_arg_t args; + + blasint info; + int uplo, trans, diag; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *N; + args.n = *NRHS; + args.a = (void *)a; + args.lda = *ldA; + args.b = (void *)b; + args.ldb = *ldB; + + info = 0; + + TOUPPER(trans_arg); + trans = -1; + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + if (args.ldb < MAX(1, args.m)) info = 7; + if (args.lda < MAX(1, args.m)) info = 9; + if (args.n < 0) info = 5; + if (args.m < 0) info = 4; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + if (diag < 0) info = 3; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + args.alpha = NULL; + args.beta = NULL; + + *Info = 0; + + if (args.m == 0 || args.n == 0) return 0; + + if (diag) { + if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { + *Info = IAMIN_K(args.n, args.a, args.lda + 1); + return 0; + } + } + + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + 
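/* Workspace layout, as the pointer arithmetic below reads: sa starts
   GEMM_OFFSET_A bytes into the shared buffer, and sb is placed after
   room for one GEMM_P x GEMM_Q packed panel, rounded up to the
   GEMM_ALIGN boundary, plus GEMM_OFFSET_B. */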
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n); + + IDEBUG_END; + + return 0; + +} From 4b21b646ea49c9e00490bb29f978d1a91a4a1192 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 6 Sep 2019 17:19:40 -0400 Subject: [PATCH 706/935] turn on optimized code --- lapack-netlib/SRC/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 87a8f51e4..1c276aff6 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -507,22 +507,22 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ spotf2.o slaswp.o sgesv.o slauu2.o \ - slauum.o strti2.o strtri.o + slauum.o strti2.o strtri.o strtrs.o DLAPACKOBJS = \ dgetrf.o dgetrs.o dpotrf.o dgetf2.o \ dpotf2.o dlaswp.o dgesv.o dlauu2.o \ - dlauum.o dtrti2.o dtrtri.o + dlauum.o dtrti2.o dtrtri.o dtrtrs.o CLAPACKOBJS = \ cgetrf.o cgetrs.o cpotrf.o cgetf2.o \ cpotf2.o claswp.o cgesv.o clauu2.o \ - clauum.o ctrti2.o ctrtri.o + clauum.o ctrti2.o ctrtri.o ctrtrs.o ZLAPACKOBJS = \ zgetrf.o zgetrs.o zpotrf.o zgetf2.o \ zpotf2.o zlaswp.o zgesv.o zlauu2.o \ - zlauum.o ztrti2.o ztrtri.o + zlauum.o ztrti2.o ztrtri.o ztrtrs.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) From 5997b6b491a7fb9b0184f4edfc1c76d16de3affa Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Sat, 7 Sep 2019 22:06:27 -0400 Subject: [PATCH 707/935] bugfix --- interface/lapack/ztrtrs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/lapack/ztrtrs.c b/interface/lapack/ztrtrs.c index 4cd423069..0536fc5d3 100644 --- a/interface/lapack/ztrtrs.c +++ b/interface/lapack/ztrtrs.c @@ -150,11 +150,11 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * if (args.nthreads == 1) { #endif - (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + (trtrs_single[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { - (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + (trtrs_parallel[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0); } #endif From f2becb777a9640225a6ef89b939fb8b0bdbbbc77 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Mon, 9 Sep 2019 11:36:50 -0400 Subject: [PATCH 708/935] fix Makefile --- lapack/trtrs/Makefile | 64 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile index f9faaf9b9..587d94e3d 100644 --- a/lapack/trtrs/Makefile +++ b/lapack/trtrs/Makefile @@ -257,52 +257,52 @@ ztrtrs_LCU_single.$(SUFFIX) : ztrtrs_single.c ztrtrs_LCN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) -xtrtrs_UNU_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_UNU_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO 
-DTRANS=1 -UDIAG $< -o $(@F) -xtrtrs_UNN_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_UNN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F) -xtrtrs_UTU_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_UTU_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F) -xtrtrs_UTN_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_UTN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F) -xtrtrs_URU_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_URU_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F) -xtrtrs_URN_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_URN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F) -xtrtrs_UCU_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_UCU_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F) -xtrtrs_UCN_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_UCN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F) -xtrtrs_LNU_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_LNU_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F) -xtrtrs_LNN_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_LNN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F) -xtrtrs_LTU_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_LTU_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F) -xtrtrs_LTN_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_LTN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F) -xtrtrs_LRU_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_LRU_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F) -xtrtrs_LRN_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_LRN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F) -xtrtrs_LCU_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_LCU_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F) -xtrtrs_LCN_single.$(SUFFIX) : xtrtrs_single.c +xtrtrs_LCN_single.$(SUFFIX) : ztrtrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) ctrtrs_UNU_parallel.$(SUFFIX) : ztrtrs_parallel.c @@ -401,52 +401,52 @@ ztrtrs_LCU_parallel.$(SUFFIX) : ztrtrs_parallel.c ztrtrs_LCN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) -xtrtrs_UNU_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_UNU_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F) -xtrtrs_UNN_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_UNN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F) -xtrtrs_UTU_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_UTU_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F) -xtrtrs_UTN_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_UTN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F) -xtrtrs_URU_parallel.$(SUFFIX) : 
xtrtrs_parallel.c +xtrtrs_URU_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F) -xtrtrs_URN_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_URN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F) -xtrtrs_UCU_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_UCU_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F) -xtrtrs_UCN_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_UCN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F) -xtrtrs_LNU_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_LNU_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F) -xtrtrs_LNN_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_LNN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F) -xtrtrs_LTU_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_LTU_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F) -xtrtrs_LTN_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_LTN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F) -xtrtrs_LRU_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_LRU_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F) -xtrtrs_LRN_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_LRN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F) -xtrtrs_LCU_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_LCU_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F) -xtrtrs_LCN_parallel.$(SUFFIX) : xtrtrs_parallel.c +xtrtrs_LCN_parallel.$(SUFFIX) : ztrtrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F) include ../../Makefile.tail From eb45eb6942b9aa35970f4f58ce268130891a79c5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 10 Sep 2019 08:27:06 +0200 Subject: [PATCH 709/935] Fix C compiler handling and BINARY=32 mode in CMAKE builds (#2248) * Fix compiler identification and option setting * Handle BINARY=32 option on X86_64 * Add xGEMM3M unroll parameters for crossbuild-target CORE2 * Replace bogus mingw64/32bit CI job with actual 32bit build mingw64 is not multilib-capable, so using an x86_64-mingw with BINARY=32 in the CI was not going to work anyway (but build passed while BINARY=32 was ignored). 
--- appveyor.yml | 7 ++++--- cmake/cc.cmake | 10 +++++----- cmake/prebuild.cmake | 4 ++++ cmake/system_check.cmake | 31 +++++++++++++++++++++++++++---- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 2f9cc7b0b..1936059d5 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -38,7 +38,8 @@ environment: - COMPILER: MinGW64-gcc-7.2.0-mingw DYNAMIC_ARCH: OFF WITH_FORTRAN: ignore - - COMPILER: MinGW64-gcc-7.2.0 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-6.3.0-32 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 COMPILER: MinGW-gcc-5.3.0 WITH_FORTRAN: ignore @@ -62,10 +63,10 @@ before_build: - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH% - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 98f9298f8..37da0d6ed 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -3,7 +3,7 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets C related variables. 
-if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang") +if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang") set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") set(COMMON_PROF "${COMMON_PROF} -fno-inline") @@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR endif () endif () -if (${CMAKE_C_COMPILER} STREQUAL "PGI") +if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI") if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64") else () @@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI") endif () endif () -if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") +if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE") if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -m64") else () @@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") endif () endif () -if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") +if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64") if (MIPS64) @@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") endif () endif () -if (${CMAKE_C_COMPILER} STREQUAL "SUN") +if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN") set(CCOMMON_OPT "${CCOMMON_OPT} -w") if (X86) set(CCOMMON_OPT "${CCOMMON_OPT} -m32") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index da185db5a..086df1943 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -133,6 +133,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 2) set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 2) + set(CGEMM3M_UNROLL_M 8) + set(CGEMM3M_UNROLL_N 4) + set(ZGEMM3M_UNROLL_M 4) + set(ZGEMM3M_UNROLL_N 4) elseif ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 610f689e0..c4a553c5a 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -39,10 +39,18 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") - if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - set(X86_64 1) + if (NOT BINARY) + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + set(X86_64 1) + else() + set(X86 1) + endif() else() - set(X86 1) + if (${BINARY} EQUAL "64") + set(X86_64 1) + else () + set(X86 1) + endif() endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) @@ -54,6 +62,22 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") else() set(ARM 1) endif() +elseif (${CMAKE_CROSSCOMPILING}) + if (${TARGET} STREQUAL "CORE2") + if (NOT BINARY) + set(X86 1) + elseif (${BINARY} EQUAL "64") + set(X86_64 1) + else () + set(X86 1) + endif() + elseif (${TARGET} STREQUAL "ARMV7") + set(ARM 1) + else() + set(ARM64 1) + endif () +else () + message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"") endif() if (X86_64) @@ -92,4 +116,3 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() file(REMOVE "avx512.tmp" "avx512.o") endif() - From 459bb9291db0a9a97718cb312c77f8ea3dba7c60 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Tue, 10 Sep 2019 17:10:33 -0400 Subject: [PATCH 710/935] fix error codes --- interface/lapack/trtrs.c | 6 +++--- interface/lapack/ztrtrs.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/interface/lapack/trtrs.c b/interface/lapack/trtrs.c index 261b07ec6..96dde1618 100644 --- 
a/interface/lapack/trtrs.c +++ b/interface/lapack/trtrs.c @@ -103,8 +103,8 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * if (diag_arg == 'U') diag = 0; if (diag_arg == 'N') diag = 1; - if (args.ldb < MAX(1, args.m)) info = 7; - if (args.lda < MAX(1, args.m)) info = 9; + if (args.ldb < MAX(1, args.m)) info = 9; + if (args.lda < MAX(1, args.m)) info = 7; if (args.n < 0) info = 5; if (args.m < 0) info = 4; if (trans < 0) info = 2; @@ -112,7 +112,7 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * if (diag < 0) info = 3; if (info != 0) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/ztrtrs.c b/interface/lapack/ztrtrs.c index 0536fc5d3..4ee51435b 100644 --- a/interface/lapack/ztrtrs.c +++ b/interface/lapack/ztrtrs.c @@ -103,8 +103,8 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * if (diag_arg == 'U') diag = 0; if (diag_arg == 'N') diag = 1; - if (args.ldb < MAX(1, args.m)) info = 7; - if (args.lda < MAX(1, args.m)) info = 9; + if (args.ldb < MAX(1, args.m)) info = 9; + if (args.lda < MAX(1, args.m)) info = 7; if (args.n < 0) info = 5; if (args.m < 0) info = 4; if (trans < 0) info = 2; @@ -112,7 +112,7 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * if (diag < 0) info = 3; if (info != 0) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } From 6cb47ea3f0a6d4263cca3d2649b8512a6b53192d Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Tue, 10 Sep 2019 17:11:01 -0400 Subject: [PATCH 711/935] fix Makefile --- lapack/trtrs/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile index 587d94e3d..a3b8f4322 100644 --- a/lapack/trtrs/Makefile +++ b/lapack/trtrs/Makefile @@ -36,7 +36,7 @@ strtrs_LNN_single.$(SUFFIX) : trtrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) strtrs_LTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F) strtrs_LTN_single.$(SUFFIX) : trtrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) @@ -60,7 +60,7 @@ strtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) strtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F) strtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) @@ -84,7 +84,7 @@ dtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) dtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F) dtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) @@ -108,7 +108,7 @@ dtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) 
dtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F) dtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) @@ -132,7 +132,7 @@ qtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) qtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c - $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F) qtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) @@ -156,7 +156,7 @@ qtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F) qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c - $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F) qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F) From 5d6525c87cfc6a7ba69e24eefb8b053d480bc97b Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Tue, 10 Sep 2019 17:30:57 -0400 Subject: [PATCH 712/935] more bugfix --- interface/lapack/trtrs.c | 6 +++--- interface/lapack/ztrtrs.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/interface/lapack/trtrs.c b/interface/lapack/trtrs.c index 96dde1618..54fbe8394 100644 --- a/interface/lapack/trtrs.c +++ b/interface/lapack/trtrs.c @@ -122,11 +122,11 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * *Info = 0; - if (args.m == 0 || args.n == 0) return 0; + if (args.m == 0) return 0; if (diag) { - if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { - *Info = IAMIN_K(args.n, args.a, args.lda + 1); + if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) { + *Info = IAMIN_K(args.m, args.a, args.lda + 1); return 0; } } diff --git a/interface/lapack/ztrtrs.c b/interface/lapack/ztrtrs.c index 4ee51435b..7f1bd9af4 100644 --- a/interface/lapack/ztrtrs.c +++ b/interface/lapack/ztrtrs.c @@ -122,11 +122,11 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * *Info = 0; - if (args.m == 0 || args.n == 0) return 0; + if (args.m == 0) return 0; if (diag) { - if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { - *Info = IAMIN_K(args.n, args.a, args.lda + 1); + if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) { + *Info = IAMIN_K(args.m, args.a, args.lda + 1); return 0; } } From 2463938879a93ff8b8207b112d03bbeb4cbabae2 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Wed, 11 Sep 2019 10:33:35 -0400 Subject: [PATCH 713/935] fix error message --- interface/lapack/gesv.c | 14 +++++++------- interface/lapack/getf2.c | 2 +- interface/lapack/getrf.c | 2 +- interface/lapack/getrs.c | 2 +- interface/lapack/lauu2.c | 2 +- interface/lapack/lauum.c | 2 +- interface/lapack/potf2.c | 2 +- interface/lapack/potrf.c | 2 +- interface/lapack/potri.c | 2 +- interface/lapack/trti2.c | 2 +- interface/lapack/trtri.c | 2 +- interface/lapack/zgetf2.c | 2 +- interface/lapack/zgetrf.c | 2 +- interface/lapack/zgetrs.c | 2 +- interface/lapack/zlauu2.c | 2 +- interface/lapack/zpotf2.c | 2 +- interface/lapack/zpotrf.c | 2 +- interface/lapack/zpotri.c | 2 +- interface/lapack/ztrti2.c | 2 +- interface/lapack/ztrtri.c | 2 +- 20 files changed, 26 
insertions(+), 26 deletions(-) diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index 721da970d..175350329 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -44,19 +44,19 @@ #ifndef COMPLEX #ifdef XDOUBLE -#define ERROR_NAME "QGESV " +#define ERROR_NAME "QGESV" #elif defined(DOUBLE) -#define ERROR_NAME "DGESV " +#define ERROR_NAME "DGESV" #else -#define ERROR_NAME "SGESV " +#define ERROR_NAME "SGESV" #endif #else #ifdef XDOUBLE -#define ERROR_NAME "XGESV " +#define ERROR_NAME "XGESV" #elif defined(DOUBLE) -#define ERROR_NAME "ZGESV " +#define ERROR_NAME "ZGESV" #else -#define ERROR_NAME "CGESV " +#define ERROR_NAME "CGESV" #endif #endif @@ -89,7 +89,7 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, if (args.m < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/getf2.c b/interface/lapack/getf2.c index 3e66c0403..8506feca9 100644 --- a/interface/lapack/getf2.c +++ b/interface/lapack/getf2.c @@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c index 44a92ddc4..02bb124b3 100644 --- a/interface/lapack/getrf.c +++ b/interface/lapack/getrf.c @@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/getrs.c b/interface/lapack/getrs.c index 1b8c83aca..c2a9eb882 100644 --- a/interface/lapack/getrs.c +++ b/interface/lapack/getrs.c @@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, if (trans < 0) info = 1; if (info != 0) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); return 0; } diff --git a/interface/lapack/lauu2.c b/interface/lapack/lauu2.c index 3599a4791..e581e3c15 100644 --- a/interface/lapack/lauu2.c +++ b/interface/lapack/lauu2.c @@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/lauum.c b/interface/lapack/lauum.c index 2c49eb0b0..70f6a0ec5 100644 --- a/interface/lapack/lauum.c +++ b/interface/lapack/lauum.c @@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/potf2.c b/interface/lapack/potf2.c index 837192265..1537b6ee4 100644 --- a/interface/lapack/potf2.c +++ b/interface/lapack/potf2.c @@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (args.n < 0) info = 2; if (uplo < 
0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c index 092272225..dbd55f62f 100644 --- a/interface/lapack/potrf.c +++ b/interface/lapack/potrf.c @@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c index d6230621f..2c0c64b6f 100644 --- a/interface/lapack/potri.c +++ b/interface/lapack/potri.c @@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/trti2.c b/interface/lapack/trti2.c index 42c4c4815..47f04f06f 100644 --- a/interface/lapack/trti2.c +++ b/interface/lapack/trti2.c @@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/trtri.c b/interface/lapack/trtri.c index 6724a678a..028529389 100644 --- a/interface/lapack/trtri.c +++ b/interface/lapack/trtri.c @@ -99,7 +99,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/zgetf2.c b/interface/lapack/zgetf2.c index 59ec4874e..68b9a7e4b 100644 --- a/interface/lapack/zgetf2.c +++ b/interface/lapack/zgetf2.c @@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c index 5031f587b..7f8db94f6 100644 --- a/interface/lapack/zgetrf.c +++ b/interface/lapack/zgetrf.c @@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/zgetrs.c b/interface/lapack/zgetrs.c index 54d4b0905..0add909ca 100644 --- a/interface/lapack/zgetrs.c +++ b/interface/lapack/zgetrs.c @@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, if (trans < 0) info = 1; if (info != 0) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); return 0; } diff --git a/interface/lapack/zlauu2.c b/interface/lapack/zlauu2.c index b0698ef2e..ae972543c 100644 --- a/interface/lapack/zlauu2.c +++ 
b/interface/lapack/zlauu2.c @@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/zpotf2.c b/interface/lapack/zpotf2.c index 27ee0891a..c74b66728 100644 --- a/interface/lapack/zpotf2.c +++ b/interface/lapack/zpotf2.c @@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c index 8cd3980d5..c4cd99bf6 100644 --- a/interface/lapack/zpotrf.c +++ b/interface/lapack/zpotrf.c @@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c index 7c72a7e62..8da211683 100644 --- a/interface/lapack/zpotri.c +++ b/interface/lapack/zpotri.c @@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/ztrti2.c b/interface/lapack/ztrti2.c index a25476677..cb9c0d557 100644 --- a/interface/lapack/ztrti2.c +++ b/interface/lapack/ztrti2.c @@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } diff --git a/interface/lapack/ztrtri.c b/interface/lapack/ztrtri.c index b3ce85b9f..dda4a9e4b 100644 --- a/interface/lapack/ztrtri.c +++ b/interface/lapack/ztrtri.c @@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); *Info = - info; return 0; } From e7c4d6705a41910240dd19b9e7082a422563bf15 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 17 Sep 2019 18:56:04 +0200 Subject: [PATCH 714/935] Revert #2051 and replace with a better fix (#2261) * Revert #2051 and add a better fix for TARGET=generic with DYNAMIC_ARCH fixes #2257 without breaking #2048 again --- kernel/Makefile.L3 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f83def47b..7998c135a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -24,9 +24,11 @@ ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif -ifeq ($(CORE), GENERIC) +ifneq ($(DYNAMIC_ARCH), 1) +ifeq ($(TARGET), GENERIC) USE_TRMM = 1 endif +endif ifeq ($(CORE), HASWELL) USE_TRMM = 1 From bfa2cc7d6411f16e1889bb7541159086949448c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 20 Sep 2019 10:29:35 +0200 Subject: [PATCH 
715/935] Restore ppc64 CI job and remove the travis_wait that caused the problem with it --- .travis.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 27ecba6c8..2b1b99b26 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ matrix: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - set -e - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -25,14 +25,14 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" - # - <<: *test-ubuntu - # os: linux-ppc64le - # before_script: - # - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" - # env: - # # for matrix annotation only - # - TARGET_BOX=PPC64LE_LINUX - # - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" - <<: *test-ubuntu env: From 673e5a049585f307fec09f38e8774d3ef902d239 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Sep 2019 22:35:22 +0200 Subject: [PATCH 716/935] Replace several POWER8/9 C kernels with their gcc7-generated assembly versions (#2263) * Add gcc7-generated assembly files for POWER8/9 isa/ica-min/max and POWER9 caxpy To work around internal compiler errors encountered when compiling the original C source with gcc 4 and 5, and wrong code generated by gcc 8.3.0 * Use gcc-generated assembly instead of original C sources to work around internal compiler errors encountered with gcc 4.8/5.4 and wrong code generation by gcc 8.3 * Use gcc-generated assembly instead of the original C source to work around internal compiler errors encountered with gcc 4.8 and 5.4, and wrong code generation by gcc 8.3 * Add gcc7-generated assembler version of caxpy for power8 to work around wrong code generated by gcc 8.3 * Handle CONJ define for caxpyc * Handle CONJ define for caxpyc * Add gcc7-generated assembly cdot for POWER9 * Use prebuilt assembly for POWER9 cdot created with gcc 7.3.1 to work around ICE in older gcc versions * Exclude POWER9 from DYNAMIC_ARCH when gcc versions is lower than 6 * Update Makefile.system * Use PROLOGUE macro to ensure correct function name for DYNAMIC_ARCH * Disable POWER9 with old gcc versions --- Makefile.system | 14 +- driver/others/dynamic_power.c | 8 + kernel/power/KERNEL.POWER8 | 10 +- kernel/power/KERNEL.POWER9 | 12 +- kernel/power/caxpy_power8.S | 574 ++++++++++++++++++++++++++++++++++ kernel/power/caxpy_power9.S | 538 +++++++++++++++++++++++++++++++ kernel/power/cdot_power9.S | 242 ++++++++++++++ kernel/power/icamax_power8.S | 458 +++++++++++++++++++++++++++ kernel/power/icamax_power9.S | 387 +++++++++++++++++++++++ kernel/power/icamin_power8.S | 454 +++++++++++++++++++++++++++ kernel/power/icamin_power9.S | 385 +++++++++++++++++++++++ kernel/power/isamax_power8.S | 434 +++++++++++++++++++++++++ kernel/power/isamax_power9.S | 397 +++++++++++++++++++++++ kernel/power/isamin_power8.S | 417 ++++++++++++++++++++++++ kernel/power/isamin_power9.S | 382 ++++++++++++++++++++++ 15 files changed, 4699 insertions(+), 13 deletions(-) create mode 100644 kernel/power/caxpy_power8.S create mode 100644 kernel/power/caxpy_power9.S create mode 100644 kernel/power/cdot_power9.S create mode 100644 kernel/power/icamax_power8.S create mode 100644 
kernel/power/icamax_power9.S create mode 100644 kernel/power/icamin_power8.S create mode 100644 kernel/power/icamin_power9.S create mode 100644 kernel/power/isamax_power8.S create mode 100644 kernel/power/isamax_power9.S create mode 100644 kernel/power/isamin_power8.S create mode 100644 kernel/power/isamin_power9.S diff --git a/Makefile.system b/Makefile.system index 2cf1322a9..8843d0ad3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -322,12 +322,13 @@ CCOMMON_OPT += -DMS_ABI endif ifeq ($(C_COMPILER), GCC) -#Test for supporting MS_ABI +#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) -# GCC Majar version > 4 +# GCC Major version > 4 # It is compatible with MSVC ABI. CCOMMON_OPT += -DMS_ABI endif @@ -554,8 +555,17 @@ endif ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 +ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 endif +ifeq ($(C_COMPILER), GCC) +ifeq ($(GCCVERSIONGT5), 1) +DYNAMIC_CORE += POWER9 +else +$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) +endif +endif +endif # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 0c4a87a5e..1dec5f4b3 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -3,7 +3,9 @@ extern gotoblas_t gotoblas_POWER6; extern gotoblas_t gotoblas_POWER8; +#if (!defined C_GCC) || (GCC_VERSION >= 60000) extern gotoblas_t gotoblas_POWER9; +#endif extern void openblas_warning(int verbose, const char *msg); @@ -19,7 +21,9 @@ static char *corename[] = { char *gotoblas_corename(void) { if (gotoblas == &gotoblas_POWER6) return corename[1]; if (gotoblas == &gotoblas_POWER8) return corename[2]; +#if (!defined C_GCC) || (GCC_VERSION >= 60000) if (gotoblas == &gotoblas_POWER9) return corename[3]; +#endif return corename[0]; } @@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER6; if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; +#if (!defined C_GCC) || (GCC_VERSION >= 60000) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; +#endif return NULL; } @@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) { { case 1: return (&gotoblas_POWER6); case 2: return (&gotoblas_POWER8); +#if (!defined C_GCC) || (GCC_VERSION >= 60000) case 3: return (&gotoblas_POWER9); +#endif default: return NULL; } snprintf(message, 128, "Core not found: %s\n", coretype); diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 43f004fbb..c08f3fb00 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -ISAMAXKERNEL = isamax.c +ISAMAXKERNEL = isamax_power8.S IDAMAXKERNEL = idamax.c -ICAMAXKERNEL = icamax.c +ICAMAXKERNEL = icamax_power8.S IZAMAXKERNEL = izamax.c # -ISAMINKERNEL = isamin.c +ISAMINKERNEL = isamin_power8.S IDAMINKERNEL = idamin.c -ICAMINKERNEL = icamin.c +ICAMINKERNEL = icamin_power8.S IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -112,7 +112,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = 
daxpy.c -CAXPYKERNEL = caxpy.c +CAXPYKERNEL = caxpy_power8.S ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index a570a903a..2ed843fff 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -ISAMAXKERNEL = isamax.c +ISAMAXKERNEL = isamax_power9.S IDAMAXKERNEL = idamax.c -ICAMAXKERNEL = icamax.c +ICAMAXKERNEL = icamax_power9.S IZAMAXKERNEL = izamax.c # -ISAMINKERNEL = isamin.c +ISAMINKERNEL = isamin_power9.S IDAMINKERNEL = idamin.c -ICAMINKERNEL = icamin.c +ICAMINKERNEL = icamin_power9.S IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -112,7 +112,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -CAXPYKERNEL = caxpy.c +CAXPYKERNEL = caxpy_power9.S ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c @@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -CDOTKERNEL = cdot.c +CDOTKERNEL = cdot_power9.S ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S new file mode 100644 index 000000000..09a423571 --- /dev/null +++ b/kernel/power/caxpy_power8.S @@ -0,0 +1,574 @@ +#define ASSEMBLER +#include "common.h" +/* + .file "caxpy.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl caxpy_k + .type caxpy_k, @function +*/ + + PROLOGUE + +caxpy_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry caxpy_k,.-caxpy_k + mr. 7,3 + ble 0,.L33 + cmpdi 7,9,1 + beq 7,.L41 +.L3: + mtctr 7 + ld 7,96(1) + sldi 9,9,3 + sldi 7,7,3 + .p2align 4,,15 +.L14: + lfs 10,4(8) + lfs 11,0(8) + lfs 12,0(10) + lfs 0,4(10) + fmuls 10,2,10 +#ifdef CONJ + fmsubs 11,11,1,10 +#else + fmadds 11,11,1,10 +#endif + fadds 12,12,11 + stfs 12,0(10) + lfs 11,0(8) + lfs 12,4(8) + add 8,8,9 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,12,1,11 + fsubs 0,0,12 +#else + fmadds 12,12,1,11 + fadds 0,0,12 +#endif + stfs 0,4(10) + add 10,10,7 + bdnz .L14 +.L33: + li 3,0 + blr + .p2align 4,,15 +.L41: + ld 6,96(1) + cmpdi 7,6,1 + bne 7,.L3 + rldicr. 4,7,0,59 + std 31,-8(1) + li 11,0 + bne 0,.L42 +.L4: + addi 6,11,8 + subf 0,4,7 + sldi 6,6,2 + addi 9,6,-32 + add 5,10,6 + add 3,8,9 + add 6,8,6 + subfc 5,5,3 + add 9,10,9 + subfe 5,5,5 + subfc 6,6,9 + subfe 31,31,31 + addi 6,5,1 + addi 5,31,1 + or 6,6,5 + rlwinm 6,6,0,0xff + cmpwi 7,6,0 + beq 7,.L7 + sradi 6,4,63 + srdi 5,7,63 + subfc 31,7,4 + adde 6,5,6 + subfic 31,0,3 + subfe 31,31,31 + xori 6,6,0x1 + neg 31,31 + and 6,6,31 + rlwinm 6,6,0,0xff + cmpwi 7,6,0 + beq 7,.L7 + cmpd 7,4,7 + li 6,1 + blt 7,.L43 +.L9: + addi 0,7,-1 + subf 0,4,0 + subfic 0,0,3 + subfe 31,31,31 + addi 0,31,1 + rlwinm 0,0,0,0xff + cmpwi 7,0,0 + bne 7,.L10 + sradi 0,4,63 + subfc 31,7,4 + adde 5,5,0 + rlwinm 5,5,0,0xff + cmpwi 7,5,0 + bne 7,.L10 + addi 0,6,-1 + addis 31,2,.LC3@toc@ha + std 30,-16(1) + xscvdpspn 12,1 + xscvdpspn 11,2 + srdi. 
30,0,2 + addis 6,2,.LC2@toc@ha + addi 6,6,.LC2@toc@l + mtctr 30 + addi 31,31,.LC3@toc@l + lxvd2x 42,0,6 + li 5,16 + li 6,0 + lxvd2x 41,0,31 + xxspltw 12,12,0 + xxspltw 11,11,0 + xxpermdi 42,42,42,2 + xxpermdi 41,41,41,2 + beq 0,.L44 + .p2align 4,,15 +.L11: +#ifdef CONJ + lxvd2x 44,3,6 + lxvd2x 45,3,5 + lxvd2x 33,9,6 + lxvd2x 0,9,5 + xxpermdi 44,44,44,2 + xxpermdi 45,45,45,2 + xxpermdi 32,33,33,2 + xxpermdi 33,0,0,2 + vperm 11,13,12,10 + vperm 13,13,12,9 + vperm 12,1,0,10 + vperm 1,1,0,9 + xvmulsp 0,11,43 + xvmulsp 32,11,45 + xvmsubmsp 45,12,0 + xvmaddasp 32,12,43 + xvaddsp 44,32,44 + xvsubsp 32,33,45 + vmrglw 1,0,12 + vmrghw 0,0,12 +#else + lxvd2x 45,3,6 + lxvd2x 33,3,5 + lxvd2x 43,9,6 + lxvd2x 0,9,5 + xxpermdi 45,45,45,2 + xxpermdi 33,33,33,2 + xxpermdi 32,43,43,2 + xxpermdi 43,0,0,2 + vperm 12,1,13,10 + vperm 1,1,13,9 + vperm 13,11,0,10 + vperm 11,11,0,9 + xvmulsp 0,11,44 + xvmulsp 32,11,33 + xvmaddmsp 33,12,0 + xvmsubasp 32,12,44 + xvaddsp 45,32,45 + xvaddsp 32,33,43 + vmrglw 1,0,13 + vmrghw 0,0,13 +#endif + xxpermdi 0,33,33,2 + xxpermdi 32,32,32,2 + stxvd2x 0,9,6 + addi 6,6,32 + stxvd2x 32,9,5 + addi 5,5,32 + bdnz .L11 + rldicr 0,0,0,61 + ld 30,-16(1) + sldi 9,0,1 + add 4,4,0 + add 11,11,9 +.L10: + sldi 6,11,2 + addi 9,4,1 + addi 5,6,4 + cmpd 7,7,9 + lfsx 12,8,6 + lfsx 0,10,6 + addi 9,11,2 + lfsx 11,8,5 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,12,1,11 +#else + fmsubs 12,12,1,11 +#endif + fadds 0,0,12 + stfsx 0,10,6 + lfsx 11,8,6 + lfsx 12,8,5 + lfsx 0,10,5 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,12,1,11 + fsubs 0,0,12 +#else + fmadds 12,12,1,11 + fadds 0,0,12 +#endif + stfsx 0,10,5 + ble 7,.L39 + sldi 9,9,2 + addi 6,4,2 + addi 5,9,4 + cmpd 7,7,6 + lfsx 12,8,9 + lfsx 0,10,9 + addi 6,11,4 + lfsx 11,8,5 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,9 + lfsx 11,8,9 + lfsx 12,8,5 + lfsx 0,10,5 + fmuls 11,2,11 + fmsubs 12,1,12,11 + fsubs 0,0,12 + stfsx 0,10,5 + ble 7,.L39 + sldi 6,6,2 + addi 4,4,3 + addi 5,6,4 + cmpd 7,7,4 + lfsx 12,8,6 + lfsx 0,10,6 + addi 9,11,6 + lfsx 11,8,5 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,6 + lfsx 11,8,6 + lfsx 12,8,5 + lfsx 0,10,5 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,1,12,11 + fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfsx 0,10,5 + ble 7,.L39 + sldi 9,9,2 + ld 31,-8(1) + addi 7,9,4 + lfsx 12,8,9 + lfsx 0,10,9 + lfsx 11,8,7 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,9 + lfsx 11,8,9 + lfsx 12,8,7 + lfsx 0,10,7 + fmuls 2,2,11 +#ifdef CONJ + fmsubs 1,1,12,2 + fsubs 1,0,1 +#else + fmadds 1,1,12,2 + fadds 1,0,1 +#endif + stfsx 1,10,7 + b .L33 +.L43: + mr 6,0 + b .L9 +.L7: + addi 10,4,1 + cmpd 7,10,7 + subf 10,4,7 + mtctr 10 + bgt 7,.L26 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,7,10 + beq 7,.L26 + .p2align 4,,15 +.L13: + lfs 10,4(3) + lfs 11,0(3) + addi 9,9,8 + addi 3,3,8 + lfs 12,-8(9) + lfs 0,-4(9) + fmuls 10,2,10 +#ifdef CONJ + fmadds 11,1,11,10 +#else + fmsubs 11,1,11,10 +#endif + fadds 12,12,11 + stfs 12,-8(9) + lfs 11,-8(3) + lfs 12,-4(3) + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,1,12,11 + fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfs 0,-4(9) + bdnz .L13 +.L39: + ld 31,-8(1) + b .L33 +.L42: +#ifdef CONJ + fneg 0,1 + xxpermdi 32,1,1,0 + addis 9,2,.LANCHOR0@toc@ha + std 28,-32(1) + sradi. 
28,4,1 + addi 9,9,.LANCHOR0@toc@l + xscvdpspn 5,2 + xvcvdpsp 32,32 + lxvd2x 12,0,9 + xxpermdi 39,0,0,0 + xxspltw 5,5,0 + xvcvdpsp 39,39 +#else + fneg 0,2 + xxpermdi 39,2,2,0 + addis 9,2,.LANCHOR0@toc@ha + std 28,-32(1) + sradi. 28,4,1 + addi 9,9,.LANCHOR0@toc@l + xscvdpspn 5,1 + xvcvdpsp 39,39 + lxvd2x 12,0,9 + xxpermdi 32,0,0,0 + xxspltw 5,5,0 + xvcvdpsp 32,32 +#endif + xxpermdi 12,12,12,2 + vmrgew 7,7,0 + beq 0,.L5 + xxlnor 38,12,12 + std 29,-24(1) + std 30,-16(1) + mr 6,8 + mr 9,10 + li 29,0 + li 30,16 + li 31,32 + li 12,48 + li 0,64 + li 11,80 + li 3,96 + li 5,112 + .p2align 4,,15 +.L6: + lxvd2x 6,0,9 + lxvd2x 40,0,6 + addi 29,29,8 + lxvd2x 41,6,30 + lxvd2x 42,6,31 + cmpd 7,28,29 + lxvd2x 43,6,12 + lxvd2x 44,6,0 + lxvd2x 45,6,11 + lxvd2x 33,6,3 + lxvd2x 32,6,5 + lxvd2x 7,9,30 + addi 6,6,128 + lxvd2x 8,9,31 + lxvd2x 9,9,12 + xxpermdi 40,40,40,2 + xxpermdi 6,6,6,2 + lxvd2x 10,9,0 + lxvd2x 11,9,11 + xxpermdi 41,41,41,2 + xxpermdi 42,42,42,2 + lxvd2x 12,9,3 + lxvd2x 0,9,5 + xxpermdi 43,43,43,2 + xxpermdi 44,44,44,2 + xxpermdi 45,45,45,2 + xxpermdi 33,33,33,2 + xxpermdi 32,32,32,2 + xxpermdi 7,7,7,2 + xxpermdi 8,8,8,2 + xxpermdi 9,9,9,2 + xxpermdi 10,10,10,2 + xxpermdi 11,11,11,2 + xxpermdi 12,12,12,2 + xxpermdi 0,0,0,2 +#ifndef CONJ + xvmaddasp 6,5,40 + xvmaddasp 7,5,41 + xvmaddasp 8,5,42 + xvmaddasp 9,5,43 + xvmaddasp 10,5,44 + xvmaddasp 11,5,45 + xvmaddasp 12,5,33 + xvmaddasp 0,5,32 + vperm 8,8,8,6 + vperm 9,9,9,6 + vperm 10,10,10,6 + vperm 11,11,11,6 + vperm 12,12,12,6 + vperm 13,13,13,6 + vperm 1,1,1,6 + vperm 0,0,0,6 +#endif + xvmaddasp 6,39,40 + xvmaddasp 7,39,41 + xvmaddasp 8,39,42 + xvmaddasp 9,39,43 + xvmaddasp 10,39,44 + xvmaddasp 11,39,45 + xvmaddasp 12,39,33 + xvmaddasp 0,39,32 +#ifdef CONJ + vperm 8,8,8,6 + vperm 9,9,9,6 + vperm 10,10,10,6 + vperm 11,11,11,6 + vperm 12,12,12,6 + vperm 13,13,13,6 + vperm 1,1,1,6 + vperm 0,0,0,6 + xvmaddasp 6,5,40 + xvmaddasp 7,5,41 + xvmaddasp 8,5,42 + xvmaddasp 9,5,43 + xvmaddasp 10,5,44 + xvmaddasp 11,5,45 + xvmaddasp 12,5,33 + xvmaddasp 0,5,32 +#endif + xxpermdi 6,6,6,2 + xxpermdi 7,7,7,2 + xxpermdi 8,8,8,2 + xxpermdi 9,9,9,2 + stxvd2x 6,0,9 + xxpermdi 10,10,10,2 + stxvd2x 7,9,30 + xxpermdi 11,11,11,2 + stxvd2x 8,9,31 + xxpermdi 12,12,12,2 + stxvd2x 9,9,12 + xxpermdi 0,0,0,2 + stxvd2x 10,9,0 + stxvd2x 11,9,11 + stxvd2x 12,9,3 + stxvd2x 0,9,5 + addi 9,9,128 + bgt 7,.L6 + ld 29,-24(1) + ld 30,-16(1) +.L5: + cmpd 7,7,4 + ble 7,.L36 + sldi 11,4,1 + ld 28,-32(1) + b .L4 +.L36: + ld 28,-32(1) + ld 31,-8(1) + b .L33 +.L44: + li 31,1 + mtctr 31 + b .L11 +.L26: + li 10,1 + mtctr 10 + b .L13 + .long 0 + .byte 0,0,0,0,0,4,0,0 + .size caxpy_k,.-caxpy_k + .section .rodata + .align 4 + .set .LANCHOR0,. 
+ 0 + .type swap_mask_arr, @object + .size swap_mask_arr, 16 +swap_mask_arr: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 7 + .byte 6 + .byte 5 + .byte 4 +.LC3: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .gnu_attribute 4, 1 + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/caxpy_power9.S b/kernel/power/caxpy_power9.S new file mode 100644 index 000000000..48e6e5ba3 --- /dev/null +++ b/kernel/power/caxpy_power9.S @@ -0,0 +1,538 @@ +#define ASSEMBLER +#include "common.h" + +/* + .file "caxpy.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl caxpy_k + .type caxpy_k, @function +*/ + + PROLOGUE + +caxpy_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry caxpy_k,.-caxpy_k + mr. 7,3 + ble 0,.L33 + cmpdi 7,9,1 + beq 7,.L37 +.L3: + mtctr 7 + ld 7,96(1) + sldi 9,9,3 + sldi 7,7,3 + .p2align 4,,15 +.L14: + lfs 10,4(8) + lfs 11,0(8) + lfs 12,0(10) + lfs 0,4(10) + fmuls 10,2,10 +#ifdef CONJ + fmadds 11,11,1,10 +#else + fmsubs 11,11,1,10 +#endif + fadds 12,12,11 + stfs 12,0(10) + lfs 11,0(8) + lfs 12,4(8) + add 8,8,9 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,12,1,11 + fsubs 0,0,12 +#else + fmadds 12,12,1,11 + fadds 0,0,12 +#endif + stfs 0,4(10) + add 10,10,7 + bdnz .L14 +.L33: + li 3,0 + blr + .p2align 4,,15 +.L37: + ld 6,96(1) + cmpdi 7,6,1 + bne 7,.L3 + rldicr. 4,7,0,59 + li 11,0 + bne 0,.L38 +.L4: + addi 6,11,8 + subf 0,4,7 + sldi 6,6,2 + addi 9,6,-32 + add 5,10,6 + add 6,8,6 + add 3,8,9 + add 9,10,9 + subfc 5,5,3 + subfe 5,5,5 + subfc 6,6,9 + subfe 12,12,12 + addi 6,5,1 + addi 5,12,1 + or 6,6,5 + rlwinm 6,6,0,0xff + cmpwi 7,6,0 + beq 7,.L7 + sradi 6,4,63 + srdi 5,7,63 + subfc 12,7,4 + adde 6,5,6 + subfic 12,0,4 + subfe 12,12,12 + xori 6,6,0x1 + neg 12,12 + and 6,6,12 + rlwinm 6,6,0,0xff + cmpwi 7,6,0 + beq 7,.L7 + cmpd 7,4,7 + li 6,1 + blt 7,.L39 +.L9: + addi 0,7,-1 + subf 0,4,0 + subfic 0,0,3 + subfe 12,12,12 + addi 0,12,1 + rlwinm 0,0,0,0xff + cmpwi 7,0,0 + bne 7,.L10 + sradi 0,4,63 + subfc 12,7,4 + adde 5,5,0 + rlwinm 5,5,0,0xff + cmpwi 7,5,0 + bne 7,.L10 + xscvdpspn 0,1 + xscvdpspn 12,2 + addi 0,6,-1 + std 31,-8(1) + addis 12,2,.LC2@toc@ha + addis 6,2,.LC3@toc@ha + li 5,16 + srdi. 
31,0,2 + addi 6,6,.LC3@toc@l + addi 12,12,.LC2@toc@l + mtctr 31 + lxv 41,0(6) + lxv 42,0(12) + li 6,0 + xxspltw 0,0,0 + xxspltw 12,12,0 + beq 0,.L40 + .p2align 4,,15 +.L11: +#ifdef CONJ + lxvx 33,3,5 + lxvx 44,3,6 + lxvx 43,9,6 + lxvx 32,9,5 + vperm 13,1,12,10 + vperm 12,1,12,9 + vperm 8,0,11,10 + vperm 0,0,11,9 + xvmulsp 33,12,44 + xvmulsp 11,12,45 + xvmaddasp 33,0,45 + xvmsubmsp 44,0,11 + xvaddsp 33,33,40 + xvsubsp 32,32,44 +#else + lxvx 33,3,6 + lxvx 32,3,5 + lxvx 43,9,6 + lxvx 44,9,5 + vperm 13,0,1,10 + vperm 0,0,1,9 + vperm 8,12,11,10 + vperm 12,12,11,9 + xvmulsp 33,12,32 + xvmulsp 11,12,45 + xvmsubasp 33,0,45 + xvmaddmsp 32,0,11 + xvaddsp 33,33,40 + xvaddsp 32,32,44 +#endif + vmrglw 13,0,1 + vmrghw 0,0,1 + stxvx 45,9,6 + stxvx 32,9,5 + addi 6,6,32 + addi 5,5,32 + bdnz .L11 + rldicr 0,0,0,61 + ld 31,-8(1) + sldi 9,0,1 + add 4,4,0 + add 11,11,9 +.L10: + sldi 5,11,2 + addi 6,4,1 + addi 9,11,2 + addi 3,5,4 + lfsx 12,8,5 + cmpd 7,7,6 + lfsx 0,10,5 + lfsx 11,8,3 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,12,1,11 +#else + fmsubs 12,12,1,11 +#endif + fadds 0,0,12 + stfsx 0,10,5 + lfsx 11,8,5 + lfsx 12,8,3 + lfsx 0,10,3 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,12,1,11 + fsubs 0,0,12 +#else + fmadds 12,12,1,11 + fadds 0,0,12 +#endif + stfsx 0,10,3 + ble 7,.L33 + sldi 9,9,2 + addi 5,4,2 + addi 6,11,4 + addi 3,9,4 + lfsx 12,8,9 + cmpd 7,7,5 + lfsx 0,10,9 + lfsx 11,8,3 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,9 + lfsx 11,8,9 + lfsx 12,8,3 + lfsx 0,10,3 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,1,12,11 + fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfsx 0,10,3 + ble 7,.L33 + sldi 6,6,2 + addi 4,4,3 + addi 9,11,6 + addi 5,6,4 + lfsx 12,8,6 + cmpd 7,7,4 + lfsx 0,10,6 + lfsx 11,8,5 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,6 + lfsx 11,8,6 + lfsx 12,8,5 + lfsx 0,10,5 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,1,12,11 + fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfsx 0,10,5 + ble 7,.L33 + sldi 9,9,2 + addi 7,9,4 + lfsx 12,8,9 + lfsx 0,10,9 + lfsx 11,8,7 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,9 + lfsx 11,8,9 + lfsx 12,8,7 + lfsx 0,10,7 + fmuls 2,2,11 +#ifdef CONJ + fmsubs 1,1,12,2 + fsubs 1,0,1 +#else + fmadds 1,1,12,2 + fadds 1,0,1 +#endif + stfsx 1,10,7 + b .L33 +.L39: + mr 6,0 + b .L9 +.L38: +#ifdef CONJ + fneg 0,1 + xxpermdi 45,1,1,0 + xscvdpspn 12,2 + addis 9,2,.LANCHOR0@toc@ha + sradi. 3,4,1 + xxpermdi 44,0,0,0 + addi 9,9,.LANCHOR0@toc@l + xvcvdpsp 45,45 + lxv 33,0(9) + xvcvdpsp 32,44 + xxspltw 12,12,0 +#else + fneg 12,2 + xxpermdi 32,2,2,0 + xscvdpspn 0,1 + addis 9,2,.LANCHOR0@toc@ha + sradi. 
3,4,1 + xxpermdi 45,12,12,0 + addi 9,9,.LANCHOR0@toc@l + xvcvdpsp 32,32 + lxv 33,0(9) + xvcvdpsp 45,45 + xxspltw 0,0,0 +#endif + vmrgew 0,0,13 + beq 0,.L5 + mr 6,8 + mr 9,10 + li 5,0 + .p2align 4,,15 +.L6: + lxv 38,16(6) + lxv 11,16(9) + addi 5,5,8 + addi 6,6,128 + addi 9,9,128 + lxv 39,-96(6) + lxv 40,-80(6) + lxv 41,-64(6) + lxv 42,-48(6) + cmpd 7,3,5 + lxv 43,-32(6) + lxv 45,-128(6) + lxv 44,-16(6) +#ifdef CONJ + lxv 0,-128(9) + vpermr 17,6,6,1 + xvmaddmsp 38,32,11 + lxv 11,-96(9) + vpermr 18,7,7,1 + vpermr 19,8,8,1 + vpermr 2,9,9,1 + vpermr 3,10,10,1 + vpermr 4,11,11,1 + xvmaddasp 0,32,45 + vpermr 5,12,12,1 + xvmaddmsp 39,32,11 + lxv 11,-80(9) + vpermr 13,13,13,1 + xvmaddasp 38,12,49 + xvmaddmsp 40,32,11 + lxv 11,-64(9) + xvmaddmsp 45,12,0 + xvmaddasp 39,12,50 + stxv 38,-112(9) + xvmaddmsp 41,32,11 + lxv 11,-48(9) + xvmaddasp 40,12,51 + stxv 45,-128(9) + stxv 39,-96(9) + xvmaddmsp 42,32,11 + lxv 11,-32(9) + xvmaddasp 41,12,34 + stxv 40,-80(9) + xvmaddmsp 43,32,11 + lxv 11,-16(9) + xvmaddasp 42,12,35 + stxv 41,-64(9) + xvmaddmsp 44,32,11 + xvmaddasp 43,12,36 + stxv 42,-48(9) + xvmaddasp 44,12,37 +#else + lxv 12,-128(9) + vpermr 17,6,6,1 + xvmaddmsp 38,0,11 + lxv 11,-96(9) + vpermr 18,7,7,1 + vpermr 19,8,8,1 + vpermr 2,9,9,1 + vpermr 3,10,10,1 + vpermr 4,11,11,1 + xvmaddasp 12,0,45 + vpermr 5,12,12,1 + xvmaddmsp 39,0,11 + lxv 11,-80(9) + vpermr 13,13,13,1 + xvmaddasp 38,32,49 + xvmaddmsp 40,0,11 + lxv 11,-64(9) + xvmaddmsp 45,32,12 + xvmaddasp 39,32,50 + stxv 38,-112(9) + xvmaddmsp 41,0,11 + lxv 11,-48(9) + xvmaddasp 40,32,51 + stxv 45,-128(9) + stxv 39,-96(9) + xvmaddmsp 42,0,11 + lxv 11,-32(9) + xvmaddasp 41,32,34 + stxv 40,-80(9) + xvmaddmsp 43,0,11 + lxv 11,-16(9) + xvmaddasp 42,32,35 + stxv 41,-64(9) + xvmaddmsp 44,0,11 + xvmaddasp 43,32,36 + stxv 42,-48(9) + xvmaddasp 44,32,37 +#endif + stxv 43,-32(9) + stxv 44,-16(9) + bgt 7,.L6 +.L5: + cmpd 7,7,4 + ble 7,.L33 + sldi 11,4,1 + b .L4 +.L7: + addi 10,4,1 + subf 8,4,7 + cmpd 7,10,7 + mtctr 8 + bgt 7,.L26 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,7,10 + beq 7,.L26 + .p2align 4,,15 +.L13: + lfs 10,4(3) + lfs 11,0(3) + lfs 12,0(9) + lfs 0,4(9) + addi 3,3,8 + addi 9,9,8 + fmuls 10,2,10 +#ifdef CONJ + fmadds 11,1,11,10 +#else + fmsubs 11,1,11,10 +#endif + fadds 12,12,11 + stfs 12,-8(9) + lfs 11,-8(3) + lfs 12,-4(3) + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,1,12,11 + fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfs 0,-4(9) + bdnz .L13 + b .L33 +.L40: + li 31,1 + mtctr 31 + b .L11 +.L26: + li 10,1 + mtctr 10 + b .L13 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size caxpy_k,.-caxpy_k + .section .rodata + .align 4 + .set .LANCHOR0,. 
+ 0 + .type swap_mask_arr, @object + .size swap_mask_arr, 16 +swap_mask_arr: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 7 + .byte 6 + .byte 5 + .byte 4 +.LC3: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .gnu_attribute 4, 1 + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/cdot_power9.S b/kernel/power/cdot_power9.S new file mode 100644 index 000000000..01d194c0c --- /dev/null +++ b/kernel/power/cdot_power9.S @@ -0,0 +1,242 @@ + .file "cdot.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl cdot_k + .type cdot_k, @function +cdot_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry cdot_k,.-cdot_k + mr. 9,3 + ble 0,.L10 + cmpdi 7,5,1 + beq 7,.L18 +.L3: + mtctr 9 + xxlxor 2,2,2 + sldi 5,5,3 + sldi 7,7,3 +#ifdef CONJ + fmr 12,2 +#endif + fmr 8,2 +#ifndef CONJ + fmr 9,2 +#endif + fmr 1,2 + .p2align 4,,15 +.L9: +#ifdef CONJ + lfs 9,0(4) + lfs 11,0(6) + lfs 10,4(6) + lfs 0,4(4) + add 6,6,7 + add 4,4,5 + fmadds 1,9,11,1 + fmadds 12,9,10,12 + fmadds 8,0,10,8 + fmadds 2,11,0,2 +#else + lfs 10,0(4) + lfs 12,0(6) + lfs 11,4(6) + lfs 0,4(4) + add 6,6,7 + add 4,4,5 + fmadds 1,10,12,1 + fmadds 8,10,11,8 + fmadds 9,0,11,9 + fmadds 2,12,0,2 +#endif + bdnz .L9 +.L7: +#ifdef CONJ + fsubs 2,12,2 + fadds 1,1,8 +#else + fadds 2,2,8 + fsubs 1,1,9 +#endif + blr + .p2align 4,,15 +.L18: + cmpdi 7,7,1 + bne 7,.L3 + rldicr. 10,9,0,60 + bne 0,.L19 + xxlxor 2,2,2 + li 8,0 +#ifdef CONJ + fmr 12,2 +#endif + fmr 8,2 +#ifndef CONJ + fmr 9,2 +#endif + fmr 1,2 +.L4: + addi 7,10,1 + sldi 8,8,2 + subf 10,10,9 + cmpd 7,7,9 + mtctr 10 + add 4,4,8 + add 6,6,8 + bgt 7,.L16 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L16 + .p2align 4,,15 +.L8: +#ifdef CONJ + lfs 9,0(4) + lfs 11,0(6) + lfs 10,4(6) + lfs 0,4(4) + addi 6,6,8 + addi 4,4,8 + fmadds 1,9,11,1 + fmadds 12,9,10,12 + fmadds 8,0,10,8 + fmadds 2,11,0,2 +#else + lfs 10,0(4) + lfs 12,0(6) + lfs 11,4(6) + lfs 0,4(4) + addi 6,6,8 + addi 4,4,8 + fmadds 1,10,12,1 + fmadds 8,10,11,8 + fmadds 9,0,11,9 + fmadds 2,12,0,2 +#endif + bdnz .L8 + b .L7 + .p2align 4,,15 +.L10: + xxlxor 1,1,1 + fmr 2,1 + blr +.L19: + addis 8,2,.LANCHOR0@toc@ha + sradi. 
3,10,1 + xxspltib 42,0 + addi 8,8,.LANCHOR0@toc@l + lxv 32,0(8) + beq 0,.L12 + xxlor 6,42,42 + xxlor 4,42,42 + xxlor 0,42,42 + xxlor 7,42,42 + xxlor 5,42,42 + xxlor 3,42,42 + xxlor 12,42,42 + mr 7,4 + mr 8,6 + li 5,0 + .p2align 4,,15 +.L6: + lxv 43,0(8) + lxv 44,16(8) + addi 5,5,4 + addi 8,8,64 + addi 7,7,64 + lxv 45,-32(8) + lxv 33,-16(8) + lxv 8,-64(7) + lxv 9,-48(7) + cmpd 7,3,5 + lxv 10,-32(7) + lxv 11,-16(7) + vpermr 6,11,11,0 + vpermr 7,12,12,0 + vpermr 8,13,13,0 + vpermr 9,1,1,0 + xvmaddasp 12,43,8 + xvmaddasp 3,44,9 + xvmaddasp 0,8,38 + xvmaddasp 4,9,39 + xvmaddasp 6,10,40 + xvmaddasp 5,45,10 + xvmaddasp 42,11,41 + xvmaddasp 7,33,11 + bgt 7,.L6 + xvaddsp 12,12,3 + xvaddsp 0,0,4 + xvaddsp 12,12,5 + xvaddsp 0,0,6 + xvaddsp 12,12,7 + xvaddsp 42,0,42 +.L5: +#ifdef CONJ + xxpermdi 8,12,12,2 + xxpermdi 0,42,42,2 + cmpd 7,9,10 + sldi 8,10,1 + xvaddsp 8,8,12 + xvaddsp 0,0,42 + xxsldwi 1,8,8,3 + xxsldwi 12,0,0,3 + xxsldwi 8,8,8,2 + xxsldwi 0,0,0,2 + xscvspdp 1,1 + xscvspdp 12,12 + xscvspdp 8,8 +#else + xxpermdi 9,12,12,2 + xxpermdi 0,42,42,2 + cmpd 7,9,10 + sldi 8,10,1 + xvaddsp 9,9,12 + xvaddsp 0,0,42 + xxsldwi 1,9,9,3 + xxsldwi 2,0,0,3 + xxsldwi 9,9,9,2 + xxsldwi 0,0,0,2 + xscvspdp 8,2 + xscvspdp 1,1 + xscvspdp 9,9 +#endif + xscvspdp 2,0 + bgt 7,.L4 + b .L7 +.L12: + xxlor 12,42,42 + b .L5 +.L16: + li 9,1 + mtctr 9 + b .L8 + .long 0 + .byte 0,0,0,0,0,0,0,0 + .size cdot_k,.-cdot_k + .section .rodata + .align 4 + .set .LANCHOR0,. + 0 + .type swap_mask_arr, @object + .size swap_mask_arr, 16 +swap_mask_arr: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/icamax_power8.S b/kernel/power/icamax_power8.S new file mode 100644 index 000000000..4872aff40 --- /dev/null +++ b/kernel/power/icamax_power8.S @@ -0,0 +1,458 @@ +/* .file "icamax.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl icamax_k + .type icamax_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + +icamax_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry icamax_k,.-icamax_k + mr. 9,3 + ble 0,.L25 + cmpdi 7,5,0 + li 3,0 + blelr 7 + cmpdi 7,5,1 + beq 7,.L54 + lfs 11,0(4) + lfs 0,4(4) + cmpdi 7,9,1 + fabs 11,11 + fabs 0,0 + fadds 11,11,0 + beq 7,.L29 + addi 9,9,-1 + sldi 5,5,3 + mtctr 9 + add 4,4,5 + li 3,0 + li 9,1 + .p2align 4,,15 +.L24: + lfs 0,4(4) + lfs 12,0(4) + add 4,4,5 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L23 + fmr 11,0 + mr 3,9 +.L23: + addi 9,9,1 + bdnz .L24 +.L52: + addi 3,3,1 + blr + .p2align 4,,15 +.L25: + li 3,0 + blr + .p2align 4,,15 +.L54: + rldicr. 
8,9,0,58 + bne 0,.L55 + addi 7,8,1 + li 10,0 + xxlxor 11,11,11 + cmpd 7,7,9 + sldi 10,10,2 + add 4,4,10 + subf 10,8,9 + mtctr 10 + li 3,0 + bgt 7,.L43 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L43 + .p2align 4,,15 +.L44: + lfs 0,4(4) + lfs 12,0(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L46 + fmr 11,0 + mr 3,8 +.L46: + addi 8,8,1 + bdnz .L44 + b .L52 + .p2align 4,,15 +.L55: + li 0,-144 + std 31,-8(1) + addis 5,2,.LC2@toc@ha + vspltisw 18,0 + vspltisw 19,0 + addis 6,2,.LC3@toc@ha + addi 5,5,.LC2@toc@l + stvx 24,1,0 + li 0,-128 + addi 6,6,.LC3@toc@l + xxlor 49,50,50 + addis 7,2,.LC4@toc@ha + lxvd2x 44,0,5 + addis 10,2,.LC5@toc@ha + stvx 25,1,0 + li 0,-112 + addi 7,7,.LC4@toc@l + lxvd2x 45,0,6 + addis 5,2,.LC6@toc@ha + addis 6,2,.LC7@toc@ha + stvx 26,1,0 + li 0,-96 + addi 10,10,.LC5@toc@l + addi 6,6,.LC7@toc@l + addi 5,5,.LC6@toc@l + stvx 27,1,0 + li 0,-80 + lxvd2x 46,0,10 + xxpermdi 44,44,44,2 + mr 10,4 + lxvd2x 48,0,6 + lxvd2x 47,0,5 + xxpermdi 45,45,45,2 + li 6,0 + stvx 28,1,0 + li 0,-64 + xxlnand 44,44,44 + xxlnand 45,45,45 + stvx 29,1,0 + li 0,-48 + vspltisw 29,8 + vadduwm 29,29,29 + xxpermdi 46,46,46,2 + stvx 30,1,0 + li 0,-32 + xxpermdi 47,47,47,2 + xxpermdi 48,48,48,2 + stvx 31,1,0 + lxvd2x 63,0,7 + addis 7,2,.LC8@toc@ha + addi 7,7,.LC8@toc@l + lxvd2x 62,0,7 + xxpermdi 63,63,63,2 + .p2align 4,,15 +.L5: + addi 3,10,16 + addi 5,10,32 + lxvd2x 34,0,10 + addi 7,10,64 + addi 31,10,48 + addi 12,10,80 + addi 11,10,96 + lxvd2x 36,0,3 + lxvd2x 37,0,5 + addi 3,10,112 + addi 5,10,128 + lxvd2x 38,0,7 + lxvd2x 7,0,31 + addi 7,10,160 + addi 31,10,144 + lxvd2x 33,0,12 + lxvd2x 39,0,11 + addi 12,10,176 + addi 11,10,192 + lxvd2x 8,0,3 + lxvd2x 40,0,5 + xxpermdi 34,34,34,2 + addi 3,10,208 + addi 5,10,224 + lxvd2x 41,0,7 + lxvd2x 9,0,31 + addi 7,10,240 + lxvd2x 10,0,12 + lxvd2x 42,0,11 + xxpermdi 37,37,37,2 + xxpermdi 36,36,36,2 + addi 6,6,32 + lxvd2x 32,0,3 + lxvd2x 43,0,5 + xxpermdi 7,7,7,2 + xxpermdi 38,38,38,2 + cmpd 7,8,6 + addi 10,10,256 + lxvd2x 11,0,7 + xxpermdi 39,39,39,2 + xxpermdi 33,33,33,2 + xxpermdi 40,40,40,2 + xxpermdi 8,8,8,2 + xxpermdi 41,41,41,2 + xxpermdi 9,9,9,2 + xxpermdi 10,10,10,2 + xxpermdi 42,42,42,2 + xxpermdi 43,43,43,2 + xxpermdi 32,32,32,2 + xxpermdi 11,11,11,2 + xvabssp 57,37 + xvabssp 58,39 + xvabssp 35,40 + xvabssp 59,41 + xvabssp 34,34 + xvabssp 33,33 + xvabssp 32,32 + xvabssp 60,43 + xvabssp 36,36 + xvabssp 37,7 + xvabssp 38,38 + xvabssp 39,8 + xvabssp 40,9 + xvabssp 41,10 + xvabssp 42,42 + xvabssp 43,11 + vperm 24,4,2,12 + vperm 4,4,2,13 + vperm 2,5,25,12 + vperm 5,5,25,13 + vperm 25,1,6,12 + vperm 6,1,6,13 + vperm 1,7,26,12 + vperm 7,7,26,13 + vperm 26,8,3,12 + vperm 8,8,3,13 + vperm 3,9,27,12 + vperm 9,9,27,13 + vperm 27,0,10,12 + vperm 10,0,10,13 + vperm 0,11,28,12 + vperm 11,11,28,13 + xvaddsp 12,33,39 + xvaddsp 38,57,38 + xvaddsp 0,32,43 + xvaddsp 42,59,42 + xvaddsp 36,56,36 + xvaddsp 37,34,37 + xvaddsp 40,58,40 + xvaddsp 41,35,41 + xvcmpgtsp 32,12,38 + xvcmpgtsp 33,0,42 + xvcmpgtsp 43,37,36 + xvcmpgtsp 39,41,40 + xxsel 12,38,12,32 + xxsel 38,47,48,32 + xxsel 0,42,0,33 + xxsel 42,47,48,33 + xxsel 37,36,37,43 + xxsel 43,63,46,43 + xxsel 41,40,41,39 + xxsel 39,63,46,39 + xvcmpgtsp 32,12,37 + xvcmpgtsp 33,0,41 + xxsel 12,37,12,32 + xxsel 43,43,38,32 + xxsel 0,41,0,33 + xxsel 33,39,42,33 + xvcmpgtsp 32,0,12 + vadduwm 1,1,29 + xxsel 0,12,0,32 + xxsel 32,43,33,32 + xvcmpgtsp 33,0,51 + vadduwm 0,17,0 + vadduwm 17,17,30 + xxsel 50,50,32,33 + xxsel 51,51,0,33 + bgt 7,.L5 + xxsldwi 11,51,51,3 + xxsldwi 12,51,51,2 + vspltw 
0,18,3 + xxsldwi 0,51,51,1 + xscvspdp 11,11 + xscvspdp 12,12 + mfvsrwz 6,32 + vspltw 0,18,2 + xscvspdp 0,0 + mfvsrwz 7,50 + mfvsrwz 5,32 + vspltw 0,18,0 + xscvspdp 51,51 + mfvsrwz 10,32 + fcmpu 7,11,12 + rldicl 3,6,0,32 + fmr 10,0 + rldicl 11,7,0,32 + rldicl 31,5,0,32 + rldicl 0,10,0,32 + beq 7,.L56 + bnl 7,.L8 + fmr 11,12 + mr 3,31 +.L8: + xscmpudp 7,0,51 + bne 7,.L11 + cmplw 7,7,10 + ble 7,.L12 + mr 7,10 +.L12: + rldicl 11,7,0,32 +.L13: + fcmpu 7,11,10 + beq 7,.L57 + blt 7,.L58 +.L17: + cmpd 7,9,8 + ble 7,.L19 + addi 7,8,1 + sldi 10,8,1 + cmpd 7,7,9 + sldi 10,10,2 + add 4,4,10 + subf 10,8,9 + mtctr 10 + bgt 7,.L37 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L37 + .p2align 4,,15 +.L21: + lfs 0,4(4) + lfs 12,0(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L20 + fmr 11,0 + mr 3,8 +.L20: + addi 8,8,1 + bdnz .L21 +.L19: + li 0,-144 + ld 31,-8(1) + addi 3,3,1 + lvx 24,1,0 + li 0,-128 + lvx 25,1,0 + li 0,-112 + lvx 26,1,0 + li 0,-96 + lvx 27,1,0 + li 0,-80 + lvx 28,1,0 + li 0,-64 + lvx 29,1,0 + li 0,-48 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L56: + cmplw 7,6,5 + ble 7,.L7 + mr 6,5 +.L7: + rldicl 3,6,0,32 + b .L8 + .p2align 4,,15 +.L29: + li 3,1 + blr + .p2align 4,,15 +.L11: + bnl 7,.L13 + xscpsgndp 10,51,51 + mr 11,0 + b .L13 + .p2align 4,,15 +.L57: + cmpd 7,3,11 + ble 7,.L17 + mr 3,11 + b .L17 + .p2align 4,,15 +.L58: + fmr 11,10 + mr 3,11 + b .L17 +.L43: + li 9,1 + mtctr 9 + b .L44 +.L37: + li 9,1 + mtctr 9 + b .L21 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size icamax_k,.-icamax_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 16 + .byte 17 + .byte 18 + .byte 19 + .byte 24 + .byte 25 + .byte 26 + .byte 27 +.LC3: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LC4: + .long 0 + .long 1 + .long 2 + .long 3 +.LC5: + .long 4 + .long 5 + .long 6 + .long 7 +.LC6: + .long 8 + .long 9 + .long 10 + .long 11 +.LC7: + .long 12 + .long 13 + .long 14 + .long 15 +.LC8: + .long 32 + .long 32 + .long 32 + .long 32 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/icamax_power9.S b/kernel/power/icamax_power9.S new file mode 100644 index 000000000..2968b3f8b --- /dev/null +++ b/kernel/power/icamax_power9.S @@ -0,0 +1,387 @@ + .file "icamax.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl icamax_k + .type icamax_k, @function +icamax_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry icamax_k,.-icamax_k + mr. 9,3 + ble 0,.L25 + cmpdi 7,5,0 + li 3,0 + blelr 7 + cmpdi 7,5,1 + beq 7,.L53 + lfs 11,0(4) + lfs 0,4(4) + cmpdi 7,9,1 + fabs 11,11 + fabs 0,0 + fadds 11,11,0 + beq 7,.L29 + addi 9,9,-1 + sldi 5,5,3 + li 3,0 + mtctr 9 + add 4,4,5 + li 9,1 + .p2align 4,,15 +.L24: + lfs 0,4(4) + lfs 12,0(4) + add 4,4,5 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L23 + fmr 11,0 + mr 3,9 +.L23: + addi 9,9,1 + bdnz .L24 +.L51: + addi 3,3,1 + blr + .p2align 4,,15 +.L25: + li 3,0 + blr + .p2align 4,,15 +.L53: + rldicr. 
8,9,0,58 + bne 0,.L54 + addi 7,8,1 + li 10,0 + subf 6,8,9 + li 3,0 + xxlxor 11,11,11 + cmpd 7,7,9 + sldi 10,10,2 + mtctr 6 + add 4,4,10 + bgt 7,.L43 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L43 + .p2align 4,,15 +.L44: + lfs 0,4(4) + lfs 12,0(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L46 + fmr 11,0 + mr 3,8 +.L46: + addi 8,8,1 + bdnz .L44 + b .L51 + .p2align 4,,15 +.L54: + addis 11,2,.LC2@toc@ha + addis 3,2,.LC3@toc@ha + addis 5,2,.LC6@toc@ha + addis 6,2,.LC7@toc@ha + xxspltib 47,0 + addis 7,2,.LC4@toc@ha + addis 10,2,.LC5@toc@ha + stxv 58,-96(1) + stxv 59,-80(1) + addi 11,11,.LC2@toc@l + addi 3,3,.LC3@toc@l + addi 5,5,.LC6@toc@l + addi 6,6,.LC7@toc@l + stxv 62,-32(1) + stxv 63,-16(1) + xxspltib 58,16 + addi 7,7,.LC4@toc@l + addi 10,10,.LC5@toc@l + xxspltib 59,32 + lxv 44,0(11) + lxv 45,0(3) + xxspltib 48,0 + lxv 62,0(5) + xxlor 46,47,47 + lxv 63,0(6) + stxv 60,-64(1) + stxv 61,-48(1) + lxv 60,0(7) + lxv 61,0(10) + li 7,0 + mr 10,4 + vextsb2w 26,26 + vextsb2w 27,27 + stxv 56,-128(1) + stxv 57,-112(1) + .p2align 4,,15 +.L5: + lxv 0,0(10) + addi 7,7,32 + addi 10,10,256 + cmpd 7,8,7 + xvabssp 34,0 + lxv 0,-240(10) + xvabssp 42,0 + lxv 0,-224(10) + xvabssp 49,0 + lxv 0,-208(10) + vpermr 25,10,2,12 + vpermr 2,10,2,13 + xvabssp 35,0 + lxv 0,-192(10) + xvaddsp 34,57,34 + xvabssp 36,0 + lxv 0,-176(10) + vpermr 10,3,17,12 + vpermr 3,3,17,13 + xvabssp 33,0 + lxv 0,-160(10) + xvaddsp 10,42,35 + xvabssp 50,0 + lxv 0,-144(10) + vpermr 17,1,4,12 + vpermr 4,1,4,13 + xvabssp 37,0 + lxv 0,-128(10) + xvaddsp 36,49,36 + xvabssp 38,0 + lxv 0,-112(10) + vpermr 1,5,18,12 + vpermr 5,5,18,13 + xvabssp 43,0 + lxv 0,-96(10) + xvaddsp 12,33,37 + xvabssp 51,0 + lxv 0,-80(10) + vpermr 18,11,6,12 + vpermr 6,11,6,13 + xvabssp 39,0 + lxv 0,-64(10) + xvaddsp 38,50,38 + xvabssp 40,0 + lxv 0,-48(10) + vpermr 11,7,19,12 + vpermr 7,7,19,13 + xvabssp 32,0 + lxv 0,-32(10) + xvaddsp 11,43,39 + xvcmpgtsp 39,10,34 + xvcmpgtsp 43,12,36 + xvabssp 56,0 + lxv 0,-16(10) + vpermr 19,0,8,12 + vpermr 8,0,8,13 + xxsel 10,34,10,39 + xxsel 12,36,12,43 + xxsel 39,60,61,39 + xxsel 43,62,63,43 + xvabssp 41,0 + xvaddsp 40,51,40 + vpermr 0,9,24,12 + vpermr 9,9,24,13 + xvaddsp 0,32,41 + xvcmpgtsp 41,11,38 + xvcmpgtsp 32,12,10 + xvcmpgtsp 42,0,40 + xxsel 11,38,11,41 + xxsel 12,10,12,32 + xxsel 43,39,43,32 + xxsel 41,60,61,41 + xxsel 0,40,0,42 + xxsel 42,62,63,42 + xvcmpgtsp 33,0,11 + xxsel 0,11,0,33 + xxsel 33,41,42,33 + xvcmpgtsp 32,0,12 + vadduwm 1,1,26 + xxsel 0,12,0,32 + xxsel 32,43,33,32 + xvcmpgtsp 33,0,48 + vadduwm 0,14,0 + vadduwm 14,14,27 + xxsel 47,47,32,33 + xxsel 48,48,0,33 + bgt 7,.L5 + xxsldwi 11,48,48,3 + xxsldwi 12,48,48,2 + li 10,0 + li 3,12 + xxsldwi 0,48,48,1 + xscvspdp 48,48 + vextuwrx 6,10,15 + li 10,4 + xscvspdp 11,11 + xscvspdp 12,12 + xscvspdp 0,0 + vextuwrx 5,10,15 + li 10,8 + vextuwrx 7,10,15 + vextuwrx 10,3,15 + rldicl 12,5,0,32 + rldicl 3,6,0,32 + rldicl 11,7,0,32 + rldicl 0,10,0,32 + fcmpu 7,11,12 + fmr 10,0 + beq 7,.L55 + bnl 7,.L8 + mr 3,12 + fmr 11,12 +.L8: + xscmpudp 7,0,48 + bne 7,.L11 + cmplw 7,7,10 + ble 7,.L12 + mr 7,10 +.L12: + rldicl 11,7,0,32 +.L13: + fcmpu 7,11,10 + beq 7,.L56 + bnl 7,.L17 + mr 3,11 + fmr 11,10 +.L17: + cmpd 7,9,8 + ble 7,.L19 + addi 7,8,1 + sldi 10,8,1 + subf 6,8,9 + cmpd 7,7,9 + sldi 10,10,2 + mtctr 6 + add 4,4,10 + bgt 7,.L37 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L37 + .p2align 4,,15 +.L21: + lfs 0,4(4) + lfs 12,0(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L20 + fmr 11,0 + mr 3,8 +.L20: + 
addi 8,8,1 + bdnz .L21 +.L19: + lxv 56,-128(1) + lxv 57,-112(1) + addi 3,3,1 + lxv 58,-96(1) + lxv 59,-80(1) + lxv 60,-64(1) + lxv 61,-48(1) + lxv 62,-32(1) + lxv 63,-16(1) + blr + .p2align 4,,15 +.L55: + cmplw 7,6,5 + ble 7,.L7 + mr 6,5 +.L7: + rldicl 3,6,0,32 + b .L8 + .p2align 4,,15 +.L29: + li 3,1 + blr + .p2align 4,,15 +.L11: + bnl 7,.L13 + mr 11,0 + xscpsgndp 10,48,48 + b .L13 + .p2align 4,,15 +.L56: + cmpd 7,3,11 + ble 7,.L17 + mr 3,11 + b .L17 +.L37: + li 9,1 + mtctr 9 + b .L21 +.L43: + li 9,1 + mtctr 9 + b .L44 + .long 0 + .byte 0,0,0,0,0,0,0,0 + .size icamax_k,.-icamax_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 16 + .byte 17 + .byte 18 + .byte 19 + .byte 24 + .byte 25 + .byte 26 + .byte 27 +.LC3: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LC4: + .long 0 + .long 1 + .long 2 + .long 3 +.LC5: + .long 4 + .long 5 + .long 6 + .long 7 +.LC6: + .long 8 + .long 9 + .long 10 + .long 11 +.LC7: + .long 12 + .long 13 + .long 14 + .long 15 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/icamin_power8.S b/kernel/power/icamin_power8.S new file mode 100644 index 000000000..e3d66798e --- /dev/null +++ b/kernel/power/icamin_power8.S @@ -0,0 +1,454 @@ +/* .file "icamin.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl icamin_k + .type icamin_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + +icamin_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry icamin_k,.-icamin_k + mr. 9,3 + ble 0,.L25 + cmpdi 7,5,0 + li 3,0 + blelr 7 + lfs 11,0(4) + lfs 0,4(4) + cmpdi 7,5,1 + fabs 11,11 + fabs 0,0 + fadds 11,11,0 + beq 7,.L54 + cmpdi 7,9,1 + beq 7,.L29 + addi 9,9,-1 + sldi 5,5,3 + mtctr 9 + add 4,4,5 + li 3,0 + li 9,1 + .p2align 4,,15 +.L24: + lfs 0,4(4) + lfs 12,0(4) + add 4,4,5 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bnl 7,.L23 + fmr 11,0 + mr 3,9 +.L23: + addi 9,9,1 + bdnz .L24 +.L52: + addi 3,3,1 + blr + .p2align 4,,15 +.L25: + li 3,0 + blr + .p2align 4,,15 +.L54: + rldicr. 
8,9,0,58 + bne 0,.L55 + addi 7,8,1 + li 10,0 + cmpd 7,7,9 + sldi 10,10,2 + add 4,4,10 + subf 10,8,9 + mtctr 10 + li 3,0 + bgt 7,.L43 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L43 + .p2align 4,,15 +.L44: + lfs 0,0(4) + lfs 12,4(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,11,0 + bng 7,.L46 + fmr 11,0 + mr 3,8 +.L46: + addi 8,8,1 + bdnz .L44 + b .L52 + .p2align 4,,15 +.L55: + li 0,-128 + std 31,-8(1) + addis 5,2,.LC2@toc@ha + xscvdpspn 11,11 + vspltisw 19,0 + addis 6,2,.LC3@toc@ha + addi 5,5,.LC2@toc@l + stvx 25,1,0 + li 0,-112 + addi 6,6,.LC3@toc@l + xxlor 50,51,51 + addis 7,2,.LC4@toc@ha + lxvd2x 44,0,5 + addis 10,2,.LC5@toc@ha + stvx 26,1,0 + li 0,-96 + addi 7,7,.LC4@toc@l + lxvd2x 45,0,6 + addis 5,2,.LC6@toc@ha + addis 6,2,.LC7@toc@ha + stvx 27,1,0 + li 0,-80 + addi 10,10,.LC5@toc@l + xxspltw 5,11,0 + addi 6,6,.LC7@toc@l + addi 5,5,.LC6@toc@l + stvx 28,1,0 + li 0,-64 + lxvd2x 47,0,10 + xxpermdi 44,44,44,2 + mr 10,4 + lxvd2x 49,0,6 + lxvd2x 48,0,5 + xxpermdi 45,45,45,2 + li 6,0 + stvx 29,1,0 + li 0,-48 + xxlnand 44,44,44 + xxlnand 45,45,45 + stvx 30,1,0 + lxvd2x 62,0,7 + addis 7,2,.LC8@toc@ha + li 0,-32 + addi 7,7,.LC8@toc@l + xxpermdi 47,47,47,2 + stvx 31,1,0 + vspltisw 31,8 + xxpermdi 48,48,48,2 + lxvd2x 46,0,7 + vadduwm 31,31,31 + xxpermdi 49,49,49,2 + xxpermdi 62,62,62,2 + .p2align 4,,15 +.L5: + addi 3,10,16 + addi 5,10,32 + lxvd2x 34,0,10 + addi 7,10,64 + addi 31,10,48 + addi 12,10,80 + addi 11,10,96 + lxvd2x 36,0,3 + lxvd2x 37,0,5 + addi 3,10,112 + addi 5,10,128 + lxvd2x 38,0,7 + lxvd2x 6,0,31 + addi 7,10,160 + addi 31,10,144 + lxvd2x 33,0,12 + lxvd2x 39,0,11 + addi 12,10,176 + addi 11,10,192 + lxvd2x 7,0,3 + lxvd2x 40,0,5 + xxpermdi 34,34,34,2 + addi 3,10,208 + addi 5,10,224 + lxvd2x 41,0,7 + lxvd2x 8,0,31 + addi 7,10,240 + lxvd2x 9,0,12 + lxvd2x 42,0,11 + xxpermdi 37,37,37,2 + xxpermdi 36,36,36,2 + addi 6,6,32 + lxvd2x 32,0,3 + lxvd2x 43,0,5 + xxpermdi 6,6,6,2 + xxpermdi 38,38,38,2 + cmpd 7,8,6 + addi 10,10,256 + lxvd2x 10,0,7 + xxpermdi 39,39,39,2 + xxpermdi 33,33,33,2 + xxpermdi 40,40,40,2 + xxpermdi 7,7,7,2 + xxpermdi 41,41,41,2 + xxpermdi 8,8,8,2 + xxpermdi 9,9,9,2 + xxpermdi 42,42,42,2 + xxpermdi 43,43,43,2 + xxpermdi 32,32,32,2 + xxpermdi 10,10,10,2 + xvabssp 58,37 + xvabssp 59,39 + xvabssp 35,40 + xvabssp 60,41 + xvabssp 34,34 + xvabssp 33,33 + xvabssp 32,32 + xvabssp 61,43 + xvabssp 36,36 + xvabssp 37,6 + xvabssp 38,38 + xvabssp 39,7 + xvabssp 40,8 + xvabssp 41,9 + xvabssp 42,42 + xvabssp 43,10 + vperm 25,4,2,12 + vperm 4,4,2,13 + vperm 2,5,26,12 + vperm 5,5,26,13 + vperm 26,1,6,12 + vperm 6,1,6,13 + vperm 1,7,27,12 + vperm 7,7,27,13 + vperm 27,8,3,12 + vperm 8,8,3,13 + vperm 3,9,28,12 + vperm 9,9,28,13 + vperm 28,0,10,12 + vperm 10,0,10,13 + vperm 0,11,29,12 + vperm 11,11,29,13 + xvaddsp 12,33,39 + xvaddsp 38,58,38 + xvaddsp 0,32,43 + xvaddsp 42,60,42 + xvaddsp 36,57,36 + xvaddsp 37,34,37 + xvaddsp 40,59,40 + xvaddsp 41,35,41 + xvcmpgtsp 32,38,12 + xvcmpgtsp 33,42,0 + xvcmpgtsp 43,36,37 + xvcmpgtsp 39,40,41 + xxsel 12,38,12,32 + xxsel 38,48,49,32 + xxsel 0,42,0,33 + xxsel 42,48,49,33 + xxsel 37,36,37,43 + xxsel 43,62,47,43 + xxsel 41,40,41,39 + xxsel 39,62,47,39 + xvcmpgtsp 32,37,12 + xvcmpgtsp 33,41,0 + xxsel 12,37,12,32 + xxsel 43,43,38,32 + xxsel 0,41,0,33 + xxsel 33,39,42,33 + xvcmpgtsp 32,12,0 + vadduwm 1,1,31 + xxsel 0,12,0,32 + xxsel 32,43,33,32 + xvcmpgtsp 33,5,0 + vadduwm 0,0,18 + vadduwm 18,18,14 + xxsel 51,51,32,33 + xxsel 5,5,0,33 + bgt 7,.L5 + xxsldwi 11,5,5,3 + xxsldwi 12,5,5,2 + vspltw 0,19,3 + xxsldwi 0,5,5,1 + xscvspdp 11,11 
+ xscvspdp 12,12 + mfvsrwz 6,32 + vspltw 0,19,2 + xscvspdp 0,0 + mfvsrwz 7,51 + mfvsrwz 5,32 + vspltw 0,19,0 + xscvspdp 5,5 + mfvsrwz 10,32 + fcmpu 7,11,12 + rldicl 3,6,0,32 + fmr 10,0 + rldicl 11,7,0,32 + rldicl 31,5,0,32 + rldicl 0,10,0,32 + beq 7,.L56 + bng 7,.L8 + fmr 11,12 + mr 3,31 +.L8: + fcmpu 7,0,5 + bne 7,.L11 + cmplw 7,7,10 + ble 7,.L12 + mr 7,10 +.L12: + rldicl 11,7,0,32 +.L13: + fcmpu 7,11,10 + beq 7,.L57 + bgt 7,.L58 +.L17: + cmpd 7,9,8 + ble 7,.L19 + addi 7,8,1 + sldi 10,8,1 + cmpd 7,7,9 + sldi 10,10,2 + add 4,4,10 + subf 10,8,9 + mtctr 10 + bgt 7,.L37 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L37 + .p2align 4,,15 +.L21: + lfs 0,0(4) + lfs 12,4(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,11,0 + bng 7,.L20 + fmr 11,0 + mr 3,8 +.L20: + addi 8,8,1 + bdnz .L21 +.L19: + li 0,-128 + ld 31,-8(1) + addi 3,3,1 + lvx 25,1,0 + li 0,-112 + lvx 26,1,0 + li 0,-96 + lvx 27,1,0 + li 0,-80 + lvx 28,1,0 + li 0,-64 + lvx 29,1,0 + li 0,-48 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L56: + cmplw 7,6,5 + ble 7,.L7 + mr 6,5 +.L7: + rldicl 3,6,0,32 + b .L8 + .p2align 4,,15 +.L29: + li 3,1 + blr + .p2align 4,,15 +.L11: + bng 7,.L13 + fmr 10,5 + mr 11,0 + b .L13 + .p2align 4,,15 +.L57: + cmpd 7,3,11 + ble 7,.L17 + mr 3,11 + b .L17 + .p2align 4,,15 +.L58: + fmr 11,10 + mr 3,11 + b .L17 +.L43: + li 9,1 + mtctr 9 + b .L44 +.L37: + li 9,1 + mtctr 9 + b .L21 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size icamin_k,.-icamin_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 16 + .byte 17 + .byte 18 + .byte 19 + .byte 24 + .byte 25 + .byte 26 + .byte 27 +.LC3: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LC4: + .long 0 + .long 1 + .long 2 + .long 3 +.LC5: + .long 4 + .long 5 + .long 6 + .long 7 +.LC6: + .long 8 + .long 9 + .long 10 + .long 11 +.LC7: + .long 12 + .long 13 + .long 14 + .long 15 +.LC8: + .long 32 + .long 32 + .long 32 + .long 32 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/icamin_power9.S b/kernel/power/icamin_power9.S new file mode 100644 index 000000000..8eaa79f33 --- /dev/null +++ b/kernel/power/icamin_power9.S @@ -0,0 +1,385 @@ + .file "icamin.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl icamin_k + .type icamin_k, @function +icamin_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry icamin_k,.-icamin_k + mr. 9,3 + ble 0,.L25 + cmpdi 7,5,0 + li 3,0 + blelr 7 + lfs 11,0(4) + lfs 0,4(4) + cmpdi 7,5,1 + fabs 11,11 + fabs 0,0 + fadds 11,11,0 + beq 7,.L53 + cmpdi 7,9,1 + beq 7,.L29 + addi 9,9,-1 + sldi 5,5,3 + li 3,0 + mtctr 9 + add 4,4,5 + li 9,1 + .p2align 4,,15 +.L24: + lfs 0,4(4) + lfs 12,0(4) + add 4,4,5 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bnl 7,.L23 + fmr 11,0 + mr 3,9 +.L23: + addi 9,9,1 + bdnz .L24 +.L51: + addi 3,3,1 + blr + .p2align 4,,15 +.L25: + li 3,0 + blr + .p2align 4,,15 +.L53: + rldicr. 
8,9,0,58 + bne 0,.L54 + addi 7,8,1 + li 10,0 + subf 6,8,9 + li 3,0 + cmpd 7,7,9 + sldi 10,10,2 + mtctr 6 + add 4,4,10 + bgt 7,.L43 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L43 + .p2align 4,,15 +.L44: + lfs 0,0(4) + lfs 12,4(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,11,0 + bng 7,.L46 + fmr 11,0 + mr 3,8 +.L46: + addi 8,8,1 + bdnz .L44 + b .L51 + .p2align 4,,15 +.L54: + xscvdpspn 9,11 + addis 11,2,.LC2@toc@ha + addis 3,2,.LC3@toc@ha + addis 5,2,.LC6@toc@ha + addis 6,2,.LC7@toc@ha + addis 7,2,.LC4@toc@ha + addis 10,2,.LC5@toc@ha + xxspltib 48,0 + addi 11,11,.LC2@toc@l + addi 3,3,.LC3@toc@l + addi 5,5,.LC6@toc@l + stxv 59,-80(1) + addi 6,6,.LC7@toc@l + stxv 60,-64(1) + stxv 63,-16(1) + addi 7,7,.LC4@toc@l + xxspltib 59,16 + lxv 44,0(11) + xxspltib 60,32 + lxv 45,0(3) + lxv 63,0(5) + xxlor 47,48,48 + lxv 46,0(6) + addi 10,10,.LC5@toc@l + stxv 61,-48(1) + stxv 62,-32(1) + xxspltw 9,9,0 + lxv 61,0(7) + lxv 62,0(10) + li 7,0 + mr 10,4 + vextsb2w 27,27 + vextsb2w 28,28 + stxv 57,-112(1) + stxv 58,-96(1) + .p2align 4,,15 +.L5: + lxv 0,0(10) + addi 7,7,32 + addi 10,10,256 + cmpd 7,8,7 + xvabssp 34,0 + lxv 0,-240(10) + xvabssp 42,0 + lxv 0,-224(10) + xvabssp 49,0 + lxv 0,-208(10) + vpermr 26,10,2,12 + vpermr 2,10,2,13 + xvabssp 35,0 + lxv 0,-192(10) + xvaddsp 34,58,34 + xvabssp 36,0 + lxv 0,-176(10) + vpermr 10,3,17,12 + vpermr 3,3,17,13 + xvabssp 33,0 + lxv 0,-160(10) + xvaddsp 10,42,35 + xvabssp 50,0 + lxv 0,-144(10) + vpermr 17,1,4,12 + vpermr 4,1,4,13 + xvabssp 37,0 + lxv 0,-128(10) + xvaddsp 36,49,36 + xvabssp 38,0 + lxv 0,-112(10) + vpermr 1,5,18,12 + vpermr 5,5,18,13 + xvabssp 43,0 + lxv 0,-96(10) + xvaddsp 12,33,37 + xvabssp 51,0 + lxv 0,-80(10) + vpermr 18,11,6,12 + vpermr 6,11,6,13 + xvabssp 39,0 + lxv 0,-64(10) + xvaddsp 38,50,38 + xvabssp 40,0 + lxv 0,-48(10) + vpermr 11,7,19,12 + vpermr 7,7,19,13 + xvabssp 32,0 + lxv 0,-32(10) + xvaddsp 11,43,39 + xvcmpgtsp 39,34,10 + xvcmpgtsp 43,36,12 + xvabssp 57,0 + lxv 0,-16(10) + vpermr 19,0,8,12 + vpermr 8,0,8,13 + xxsel 10,34,10,39 + xxsel 12,36,12,43 + xxsel 39,61,62,39 + xxsel 43,63,46,43 + xvabssp 41,0 + xvaddsp 40,51,40 + vpermr 0,9,25,12 + vpermr 9,9,25,13 + xvaddsp 0,32,41 + xvcmpgtsp 41,38,11 + xvcmpgtsp 32,10,12 + xvcmpgtsp 42,40,0 + xxsel 11,38,11,41 + xxsel 12,10,12,32 + xxsel 43,39,43,32 + xxsel 41,61,62,41 + xxsel 0,40,0,42 + xxsel 42,63,46,42 + xvcmpgtsp 33,11,0 + xxsel 0,11,0,33 + xxsel 33,41,42,33 + xvcmpgtsp 32,12,0 + vadduwm 1,1,27 + xxsel 0,12,0,32 + xxsel 32,43,33,32 + xvcmpgtsp 33,9,0 + vadduwm 0,0,15 + vadduwm 15,15,28 + xxsel 48,48,32,33 + xxsel 9,9,0,33 + bgt 7,.L5 + xxsldwi 11,9,9,3 + xxsldwi 12,9,9,2 + li 10,0 + li 3,12 + xxsldwi 0,9,9,1 + xscvspdp 9,9 + vextuwrx 6,10,16 + li 10,4 + xscvspdp 11,11 + xscvspdp 12,12 + xscvspdp 0,0 + vextuwrx 5,10,16 + li 10,8 + vextuwrx 7,10,16 + vextuwrx 10,3,16 + rldicl 12,5,0,32 + rldicl 3,6,0,32 + rldicl 11,7,0,32 + rldicl 0,10,0,32 + fcmpu 7,11,12 + fmr 10,0 + beq 7,.L55 + bng 7,.L8 + mr 3,12 + fmr 11,12 +.L8: + fcmpu 7,0,9 + bne 7,.L11 + cmplw 7,7,10 + ble 7,.L12 + mr 7,10 +.L12: + rldicl 11,7,0,32 +.L13: + fcmpu 7,11,10 + beq 7,.L56 + bng 7,.L17 + mr 3,11 + fmr 11,10 +.L17: + cmpd 7,9,8 + ble 7,.L19 + addi 7,8,1 + sldi 10,8,1 + subf 6,8,9 + cmpd 7,7,9 + sldi 10,10,2 + mtctr 6 + add 4,4,10 + bgt 7,.L37 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L37 + .p2align 4,,15 +.L21: + lfs 0,0(4) + lfs 12,4(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,11,0 + bng 7,.L20 + fmr 11,0 + mr 3,8 +.L20: + addi 8,8,1 + bdnz .L21 +.L19: + lxv 
57,-112(1) + lxv 58,-96(1) + addi 3,3,1 + lxv 59,-80(1) + lxv 60,-64(1) + lxv 61,-48(1) + lxv 62,-32(1) + lxv 63,-16(1) + blr + .p2align 4,,15 +.L55: + cmplw 7,6,5 + ble 7,.L7 + mr 6,5 +.L7: + rldicl 3,6,0,32 + b .L8 + .p2align 4,,15 +.L29: + li 3,1 + blr + .p2align 4,,15 +.L11: + bng 7,.L13 + mr 11,0 + fmr 10,9 + b .L13 + .p2align 4,,15 +.L56: + cmpd 7,3,11 + ble 7,.L17 + mr 3,11 + b .L17 +.L37: + li 9,1 + mtctr 9 + b .L21 +.L43: + li 9,1 + mtctr 9 + b .L44 + .long 0 + .byte 0,0,0,0,0,0,0,0 + .size icamin_k,.-icamin_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 16 + .byte 17 + .byte 18 + .byte 19 + .byte 24 + .byte 25 + .byte 26 + .byte 27 +.LC3: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LC4: + .long 0 + .long 1 + .long 2 + .long 3 +.LC5: + .long 4 + .long 5 + .long 6 + .long 7 +.LC6: + .long 8 + .long 9 + .long 10 + .long 11 +.LC7: + .long 12 + .long 13 + .long 14 + .long 15 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S new file mode 100644 index 000000000..c8fcaecc3 --- /dev/null +++ b/kernel/power/isamax_power8.S @@ -0,0 +1,434 @@ +/* .file "isamax.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl isamax_k + .type isamax_k, @function +*/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + +isamax_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry isamax_k,.-isamax_k + mr. 11,3 + ble 0,.L36 + cmpdi 7,5,0 + li 3,0 + blelr 7 + cmpdi 7,5,1 + beq 7,.L69 + rldicr. 7,11,0,61 + beq 0,.L40 + sldi 3,5,1 + xxlxor 0,0,0 + sldi 6,5,2 + add 3,3,5 + sldi 0,5,4 + sldi 3,3,2 + sldi 5,5,3 + mr 9,4 + li 8,0 + li 10,0 + .p2align 4,,15 +.L31: + lfs 12,0(9) + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L23 + fmr 0,12 + mr 8,10 +.L23: + lfsx 12,9,6 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L25 + fmr 0,12 + addi 8,10,1 +.L25: + lfsx 12,9,5 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L27 + fmr 0,12 + addi 8,10,2 +.L27: + lfsx 12,9,3 + add 9,9,0 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L29 + fmr 0,12 + addi 8,10,3 +.L29: + addi 10,10,4 + cmpd 7,7,10 + bgt 7,.L31 + addi 7,7,-1 + srdi 7,7,2 + addi 7,7,1 + sldi 9,7,2 + mulld 7,6,7 + cmpd 7,11,9 + ble 7,.L67 +.L22: + addi 10,9,1 + sldi 7,7,2 + cmpd 7,10,11 + subf 10,9,11 + mtctr 10 + add 4,4,7 + bgt 7,.L54 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L54 + .p2align 4,,15 +.L35: + lfs 12,0(4) + add 4,4,6 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L33 + fmr 0,12 + mr 8,9 +.L33: + addi 9,9,1 + bdnz .L35 +.L67: + addi 3,8,1 + blr + .p2align 4,,15 +.L36: + li 3,0 + blr + .p2align 4,,15 +.L69: + rldicr. 
10,11,0,57 + bne 0,.L70 + addi 7,10,1 + sldi 9,10,2 + xxlxor 12,12,12 + cmpd 7,7,11 + add 4,4,9 + subf 9,10,11 + li 8,0 + mtctr 9 + bgt 7,.L60 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L60 + .p2align 4,,15 +.L61: + lfs 0,0(4) + addi 4,4,4 + fabs 0,0 + fcmpu 7,0,12 + bng 7,.L63 + fmr 12,0 + mr 8,10 +.L63: + addi 10,10,1 + bdnz .L61 + b .L67 + .p2align 4,,15 +.L70: + li 0,-64 + std 31,-8(1) + addis 3,2,.LC2@toc@ha + vspltisw 18,0 + vspltisw 12,0 + addis 5,2,.LC3@toc@ha + addis 6,2,.LC6@toc@ha + stvx 29,1,0 + li 0,-48 + addis 8,2,.LC7@toc@ha + xxlor 35,50,50 + addi 3,3,.LC2@toc@l + addi 5,5,.LC3@toc@l + stvx 30,1,0 + addi 6,6,.LC6@toc@l + li 0,-32 + addi 8,8,.LC7@toc@l + lxvd2x 51,0,3 + lxvd2x 34,0,5 + addis 7,2,.LC4@toc@ha + stvx 31,1,0 + lxvd2x 47,0,6 + addis 9,2,.LC5@toc@ha + addi 7,7,.LC4@toc@l + lxvd2x 48,0,8 + addi 9,9,.LC5@toc@l + vspltisw 17,8 + vadduwm 17,17,17 + lxvd2x 36,0,7 + li 7,0 + lxvd2x 37,0,9 + mr 9,4 + .p2align 4,,15 +.L5: + addi 5,9,16 + addi 6,9,32 + lxvd2x 41,0,9 + vadduwm 31,3,15 + addi 8,9,64 + addi 31,9,48 + addi 12,9,80 + addi 3,9,96 + lxvd2x 5,0,5 + lxvd2x 43,0,6 + addi 5,9,112 + addi 6,9,128 + lxvd2x 1,0,8 + lxvd2x 9,0,31 + addi 8,9,160 + addi 31,9,144 + lxvd2x 6,0,12 + lxvd2x 13,0,3 + addi 12,9,176 + addi 3,9,192 + lxvd2x 11,0,5 + lxvd2x 2,0,6 + xvabssp 41,41 + addi 5,9,208 + addi 6,9,224 + lxvd2x 3,0,8 + lxvd2x 7,0,31 + addi 8,9,240 + lxvd2x 10,0,12 + lxvd2x 4,0,3 + xvabssp 43,43 + xvabssp 5,5 + addi 7,7,64 + lxvd2x 8,0,5 + lxvd2x 0,0,6 + xvabssp 9,9 + xvabssp 1,1 + cmpd 7,10,7 + addi 9,9,256 + lxvd2x 12,0,8 + xvabssp 6,6 + xvabssp 13,13 + xvabssp 11,11 + xvabssp 2,2 + xvabssp 7,7 + xvabssp 3,3 + xvabssp 10,10 + xvabssp 4,4 + xvabssp 8,8 + xvabssp 0,0 + xvabssp 12,12 + xvcmpgtsp 32,5,41 + xvcmpgtsp 61,9,43 + xvcmpgtsp 45,6,1 + xvcmpgtsp 62,11,13 + xvcmpgtsp 38,7,2 + xvcmpgtsp 46,10,3 + xvcmpgtsp 40,8,4 + xvcmpgtsp 39,12,0 + xxsel 5,41,5,32 + xxsel 32,51,34,32 + xxsel 9,43,9,61 + xxsel 6,1,6,45 + xxsel 11,13,11,62 + xxsel 43,51,34,45 + xxsel 7,2,7,38 + xvcmpgtsp 41,9,5 + xxsel 10,3,10,46 + xvcmpgtsp 45,11,6 + xxsel 8,4,8,40 + xxsel 62,36,37,62 + xxsel 0,0,12,39 + xvcmpgtsp 42,10,7 + xxsel 61,36,37,61 + xxsel 40,51,34,40 + xvcmpgtsp 33,0,8 + xxsel 39,36,37,39 + xxsel 38,51,34,38 + xxsel 46,36,37,46 + xxsel 9,5,9,41 + xxsel 41,32,61,41 + xxsel 12,6,11,45 + xxsel 45,43,62,45 + xxsel 11,7,10,42 + xvcmpgtsp 32,12,9 + vadduwm 13,13,17 + xxsel 42,38,46,42 + xxsel 0,8,0,33 + xxsel 33,40,39,33 + xvcmpgtsp 43,0,11 + vadduwm 1,1,17 + xxsel 12,9,12,32 + xxsel 32,41,45,32 + vadduwm 0,3,0 + vadduwm 3,3,16 + xxsel 0,11,0,43 + xxsel 33,42,33,43 + xvcmpgtsp 45,0,12 + vadduwm 1,31,1 + xxsel 0,12,0,45 + xxsel 32,32,33,45 + xvcmpgtsp 33,0,44 + xxsel 50,50,32,33 + xxsel 44,44,0,33 + bgt 7,.L5 + xxsldwi 12,44,44,1 + xscvspdp 10,44 + vspltw 0,18,0 + xxsldwi 0,44,44,3 + xscvspdp 12,12 + mfvsrwz 3,50 + mfvsrwz 6,32 + vspltw 0,18,3 + xscvspdp 0,0 + xxsldwi 44,44,44,2 + mfvsrwz 7,32 + vspltw 0,18,2 + xscvspdp 44,44 + mfvsrwz 9,32 + fcmpu 7,12,10 + rldicl 8,3,0,32 + rldicl 31,6,0,32 + fmr 11,0 + rldicl 0,7,0,32 + rldicl 5,9,0,32 + beq 7,.L71 + bnl 7,.L8 + fmr 12,10 + mr 8,31 +.L8: + xscmpudp 7,0,44 + bne 7,.L11 + cmplw 7,7,9 + ble 7,.L12 + mr 7,9 +.L12: + rldicl 5,7,0,32 +.L13: + fcmpu 7,12,11 + beq 7,.L72 + bnl 7,.L17 + fmr 12,11 + mr 8,5 +.L17: + cmpd 7,11,10 + ble 7,.L16 + addi 7,10,1 + sldi 9,10,2 + cmpd 7,7,11 + add 4,4,9 + subf 9,10,11 + mtctr 9 + bgt 7,.L53 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L53 + .p2align 4,,15 +.L21: + lfs 0,0(4) + addi 4,4,4 + fabs 0,0 + 
fcmpu 7,0,12 + bng 7,.L19 + fmr 12,0 + mr 8,10 +.L19: + addi 10,10,1 + bdnz .L21 +.L16: + li 0,-64 + ld 31,-8(1) + addi 3,8,1 + lvx 29,1,0 + li 0,-48 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L71: + cmplw 7,3,6 + ble 7,.L7 + mr 3,6 +.L7: + rldicl 8,3,0,32 + b .L8 + .p2align 4,,15 +.L40: + xxlxor 0,0,0 + sldi 6,5,2 + li 8,0 + li 9,0 + b .L22 + .p2align 4,,15 +.L11: + blt 7,.L39 + mr 5,0 + b .L13 + .p2align 4,,15 +.L72: + cmpd 7,8,5 + ble 7,.L17 + mr 8,5 + b .L17 + .p2align 4,,15 +.L39: + xscpsgndp 11,44,44 + b .L13 +.L53: + li 9,1 + mtctr 9 + b .L21 +.L54: + li 10,1 + mtctr 10 + b .L35 +.L60: + li 9,1 + mtctr 9 + b .L61 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size isamax_k,.-isamax_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .long 0 + .long 1 + .long 2 + .long 3 +.LC3: + .long 4 + .long 5 + .long 6 + .long 7 +.LC4: + .long 8 + .long 9 + .long 10 + .long 11 +.LC5: + .long 12 + .long 13 + .long 14 + .long 15 +.LC6: + .long 32 + .long 32 + .long 32 + .long 32 +.LC7: + .long 64 + .long 64 + .long 64 + .long 64 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/isamax_power9.S b/kernel/power/isamax_power9.S new file mode 100644 index 000000000..9df1e773c --- /dev/null +++ b/kernel/power/isamax_power9.S @@ -0,0 +1,397 @@ + .file "isamax.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl isamax_k + .type isamax_k, @function +isamax_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry isamax_k,.-isamax_k + mr. 11,3 + ble 0,.L36 + cmpdi 7,5,0 + li 3,0 + blelr 7 + cmpdi 7,5,1 + beq 7,.L69 + rldicr. 7,11,0,61 + beq 0,.L40 + sldi 10,5,1 + sldi 6,5,2 + sldi 0,5,4 + sldi 3,5,3 + mr 9,4 + xxlxor 0,0,0 + li 8,0 + add 5,10,5 + li 10,0 + sldi 5,5,2 + .p2align 4,,15 +.L31: + lfs 12,0(9) + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L23 + fmr 0,12 + mr 8,10 +.L23: + lfsx 12,9,6 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L25 + fmr 0,12 + addi 8,10,1 +.L25: + lfsx 12,9,3 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L27 + fmr 0,12 + addi 8,10,2 +.L27: + lfsx 12,9,5 + add 9,9,0 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L29 + fmr 0,12 + addi 8,10,3 +.L29: + addi 10,10,4 + cmpd 7,7,10 + bgt 7,.L31 + addi 7,7,-1 + srdi 7,7,2 + addi 7,7,1 + sldi 9,7,2 + mulld 7,6,7 + cmpd 7,11,9 + ble 7,.L67 +.L22: + addi 10,9,1 + sldi 7,7,2 + subf 5,9,11 + cmpd 7,10,11 + mtctr 5 + add 4,4,7 + bgt 7,.L54 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L54 + .p2align 4,,15 +.L35: + lfs 12,0(4) + add 4,4,6 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L33 + fmr 0,12 + mr 8,9 +.L33: + addi 9,9,1 + bdnz .L35 +.L67: + addi 3,8,1 + blr + .p2align 4,,15 +.L36: + li 3,0 + blr + .p2align 4,,15 +.L69: + rldicr. 
10,11,0,57 + bne 0,.L70 + addi 7,10,1 + sldi 9,10,2 + subf 6,10,11 + li 8,0 + xxlxor 12,12,12 + cmpd 7,7,11 + mtctr 6 + add 4,4,9 + bgt 7,.L60 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L60 + .p2align 4,,15 +.L61: + lfs 0,0(4) + addi 4,4,4 + fabs 0,0 + fcmpu 7,0,12 + bng 7,.L63 + fmr 12,0 + mr 8,10 +.L63: + addi 10,10,1 + bdnz .L61 + b .L67 + .p2align 4,,15 +.L70: + addis 6,2,.LC2@toc@ha + addis 7,2,.LC3@toc@ha + addis 8,2,.LC4@toc@ha + addis 9,2,.LC5@toc@ha + xxspltib 46,0 + stxv 61,-48(1) + stxv 62,-32(1) + addi 6,6,.LC2@toc@l + addi 7,7,.LC3@toc@l + stxv 63,-16(1) + xxspltib 61,32 + xxspltib 63,16 + xxspltib 62,64 + addi 8,8,.LC4@toc@l + addi 9,9,.LC5@toc@l + lxv 47,0(6) + xxspltib 34,0 + lxv 48,0(7) + xxlor 51,46,46 + lxv 49,0(8) + lxv 50,0(9) + li 8,0 + mr 9,4 + vextsb2w 29,29 + vextsb2w 31,31 + vextsb2w 30,30 + stxv 59,-80(1) + stxv 60,-64(1) + .p2align 4,,15 +.L5: + lxv 0,0(9) + vadduwm 27,19,29 + lxv 12,240(9) + addi 8,8,64 + addi 9,9,256 + cmpd 7,10,8 + xvabssp 44,0 + lxv 0,-240(9) + xvabssp 12,12 + xvabssp 5,0 + lxv 0,-224(9) + xvabssp 32,0 + lxv 0,-208(9) + xvcmpgtsp 35,5,44 + xvabssp 9,0 + lxv 0,-192(9) + xxsel 5,44,5,35 + xxsel 35,47,48,35 + xvabssp 1,0 + lxv 0,-176(9) + xvcmpgtsp 60,9,32 + xvabssp 6,0 + lxv 0,-160(9) + xxsel 9,32,9,60 + xxsel 60,49,50,60 + xvabssp 13,0 + lxv 0,-144(9) + xvcmpgtsp 42,9,5 + xvcmpgtsp 37,6,1 + xvabssp 11,0 + lxv 0,-128(9) + xxsel 9,5,9,42 + xxsel 42,35,60,42 + xxsel 6,1,6,37 + xxsel 37,47,48,37 + xvabssp 2,0 + lxv 0,-112(9) + xvcmpgtsp 36,11,13 + xvabssp 7,0 + lxv 0,-96(9) + xxsel 11,13,11,36 + xxsel 36,49,50,36 + xvabssp 3,0 + lxv 0,-80(9) + xvcmpgtsp 45,11,6 + xvcmpgtsp 39,7,2 + xvabssp 10,0 + lxv 0,-64(9) + xxsel 7,2,7,39 + xxsel 39,47,48,39 + xvabssp 4,0 + lxv 0,-48(9) + xvcmpgtsp 38,10,3 + xvabssp 8,0 + lxv 0,-32(9) + xxsel 10,3,10,38 + xxsel 38,49,50,38 + xvabssp 0,0 + xvcmpgtsp 43,10,7 + xvcmpgtsp 41,8,4 + xvcmpgtsp 40,12,0 + xxsel 8,4,8,41 + xxsel 41,47,48,41 + xxsel 0,0,12,40 + xxsel 12,6,11,45 + xxsel 11,7,10,43 + xxsel 45,37,36,45 + xvcmpgtsp 33,0,8 + xvcmpgtsp 32,12,9 + vadduwm 13,13,31 + xxsel 40,49,50,40 + xxsel 43,39,38,43 + xxsel 0,8,0,33 + xxsel 12,9,12,32 + xxsel 33,41,40,33 + xxsel 32,42,45,32 + xvcmpgtsp 44,0,11 + vadduwm 1,1,31 + vadduwm 0,19,0 + vadduwm 19,19,30 + xxsel 0,11,0,44 + xxsel 33,43,33,44 + xvcmpgtsp 45,0,12 + vadduwm 1,27,1 + xxsel 0,12,0,45 + xxsel 32,32,33,45 + xvcmpgtsp 33,0,34 + xxsel 46,46,32,33 + xxsel 34,34,0,33 + bgt 7,.L5 + xxsldwi 12,34,34,3 + xxsldwi 11,34,34,2 + li 9,0 + li 8,12 + xxsldwi 0,34,34,1 + xscvspdp 34,34 + vextuwrx 3,9,14 + li 9,4 + xscvspdp 12,12 + xscvspdp 11,11 + xscvspdp 0,0 + vextuwrx 6,9,14 + li 9,8 + vextuwrx 7,9,14 + vextuwrx 9,8,14 + rldicl 12,6,0,32 + rldicl 8,3,0,32 + rldicl 0,7,0,32 + rldicl 5,9,0,32 + fcmpu 7,12,11 + fmr 10,0 + beq 7,.L71 + bnl 7,.L8 + mr 8,12 + fmr 12,11 +.L8: + xscmpudp 7,0,34 + bne 7,.L11 + cmplw 7,7,9 + ble 7,.L12 + mr 7,9 +.L12: + rldicl 5,7,0,32 +.L13: + fcmpu 7,12,10 + beq 7,.L72 + bnl 7,.L17 + mr 8,5 + fmr 12,10 +.L17: + cmpd 7,11,10 + ble 7,.L16 + addi 7,10,1 + sldi 9,10,2 + subf 6,10,11 + cmpd 7,7,11 + mtctr 6 + add 4,4,9 + bgt 7,.L53 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L53 + .p2align 4,,15 +.L21: + lfs 0,0(4) + addi 4,4,4 + fabs 0,0 + fcmpu 7,0,12 + bng 7,.L19 + fmr 12,0 + mr 8,10 +.L19: + addi 10,10,1 + bdnz .L21 +.L16: + lxv 59,-80(1) + lxv 60,-64(1) + addi 3,8,1 + lxv 61,-48(1) + lxv 62,-32(1) + lxv 63,-16(1) + blr + .p2align 4,,15 +.L71: + cmplw 7,3,6 + ble 7,.L7 + mr 3,6 +.L7: + rldicl 8,3,0,32 + b .L8 + .p2align 4,,15 
+.L40: + sldi 6,5,2 + li 8,0 + li 9,0 + xxlxor 0,0,0 + b .L22 + .p2align 4,,15 +.L11: + blt 7,.L39 + mr 5,0 + b .L13 + .p2align 4,,15 +.L72: + cmpd 7,8,5 + ble 7,.L17 + mr 8,5 + b .L17 + .p2align 4,,15 +.L39: + xscpsgndp 10,34,34 + b .L13 +.L53: + li 9,1 + mtctr 9 + b .L21 +.L54: + li 10,1 + mtctr 10 + b .L35 +.L60: + li 9,1 + mtctr 9 + b .L61 + .long 0 + .byte 0,0,0,0,0,0,0,0 + .size isamax_k,.-isamax_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .long 0 + .long 1 + .long 2 + .long 3 +.LC3: + .long 4 + .long 5 + .long 6 + .long 7 +.LC4: + .long 8 + .long 9 + .long 10 + .long 11 +.LC5: + .long 12 + .long 13 + .long 14 + .long 15 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S new file mode 100644 index 000000000..3873e879b --- /dev/null +++ b/kernel/power/isamin_power8.S @@ -0,0 +1,417 @@ +/* .file "isamin.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl isamin_k + .type isamin_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + +isamin_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry isamin_k,.-isamin_k + mr. 11,3 + ble 0,.L36 + cmpdi 7,5,0 + li 3,0 + blelr 7 + lfs 0,0(4) + li 0,-48 + cmpdi 7,5,1 + stvx 30,1,0 + li 0,-32 + stvx 31,1,0 + fabs 0,0 + beq 7,.L62 + rldicr. 6,11,0,61 + beq 0,.L40 + sldi 0,5,1 + sldi 12,5,2 + std 31,-8(1) + add 0,0,5 + neg 31,5 + sldi 3,5,4 + sldi 0,0,2 + add 7,4,12 + sldi 31,31,2 + sldi 5,5,3 + li 9,0 + li 10,0 + b .L24 + .p2align 4,,15 +.L41: + mr 10,9 +.L25: + fmr 0,12 + add 7,7,3 +.L24: + lfs 12,0(7) + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L26 + fmr 0,12 + addi 10,9,1 +.L26: + add 8,31,7 + lfsx 12,8,5 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L28 + fmr 0,12 + addi 10,9,2 +.L28: + lfsx 12,8,0 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L30 + fmr 0,12 + addi 10,9,3 +.L30: + addi 9,9,4 + cmpd 7,6,9 + ble 7,.L63 + lfsx 12,8,3 + fabs 12,12 + fcmpu 7,12,0 + blt 7,.L41 + fmr 12,0 + b .L25 + .p2align 4,,15 +.L36: + li 3,0 + blr + .p2align 4,,15 +.L63: + addi 6,6,-1 + ld 31,-8(1) + srdi 6,6,2 + addi 6,6,1 + sldi 9,6,2 + mulld 6,12,6 + cmpd 7,11,9 + ble 7,.L33 +.L23: + addi 8,9,1 + sldi 6,6,2 + cmpd 7,8,11 + subf 8,9,11 + mtctr 8 + add 4,4,6 + bgt 7,.L52 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L52 + .p2align 4,,15 +.L35: + lfs 12,0(4) + add 4,4,12 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L34 + fmr 0,12 + mr 10,9 +.L34: + addi 9,9,1 + bdnz .L35 +.L33: + li 0,-48 + addi 3,10,1 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L62: + rldicr. 
8,11,0,57 + li 10,0 + bne 0,.L64 +.L4: + addi 7,8,1 + sldi 9,8,2 + cmpd 7,7,11 + add 4,4,9 + subf 9,8,11 + mtctr 9 + bgt 7,.L51 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L51 + .p2align 4,,15 +.L22: + lfs 12,0(4) + addi 4,4,4 + fabs 12,12 + fcmpu 7,0,12 + bng 7,.L21 + fmr 0,12 + mr 10,8 +.L21: + addi 8,8,1 + bdnz .L22 + li 0,-48 + addi 3,10,1 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L64: + lxvd2x 4,0,4 + addis 10,2,.LC2@toc@ha + addis 5,2,.LC3@toc@ha + std 31,-8(1) + vspltisw 2,0 + addi 10,10,.LC2@toc@l + addis 7,2,.LC4@toc@ha + addis 9,2,.LC5@toc@ha + addis 6,2,.LC6@toc@ha + lxvd2x 51,0,10 + addis 10,2,.LC7@toc@ha + addi 7,7,.LC4@toc@l + addi 9,9,.LC5@toc@l + addi 5,5,.LC3@toc@l + xvabssp 4,4 + addi 6,6,.LC6@toc@l + addi 10,10,.LC7@toc@l + lxvd2x 36,0,7 + vspltisw 18,8 + lxvd2x 37,0,9 + lxvd2x 35,0,5 + mr 9,4 + li 7,0 + lxvd2x 48,0,6 + lxvd2x 49,0,10 + vadduwm 18,18,18 + xxlor 38,51,51 + xxlor 40,4,4 + b .L6 + .p2align 4,,15 +.L65: + lxvd2x 5,0,9 + xvabssp 40,5 +.L6: + addi 5,9,16 + addi 6,9,32 + vadduwm 14,2,16 + addi 10,9,64 + addi 12,9,48 + addi 31,9,80 + addi 3,9,96 + lxvd2x 5,0,5 + lxvd2x 42,0,6 + addi 5,9,112 + addi 6,9,128 + lxvd2x 44,0,10 + lxvd2x 9,0,12 + addi 10,9,160 + addi 12,9,144 + lxvd2x 6,0,31 + lxvd2x 1,0,3 + addi 31,9,176 + addi 3,9,192 + lxvd2x 11,0,5 + lxvd2x 13,0,6 + addi 5,9,208 + addi 6,9,224 + lxvd2x 2,0,10 + lxvd2x 7,0,12 + addi 10,9,240 + lxvd2x 10,0,31 + lxvd2x 3,0,3 + xvabssp 42,42 + xvabssp 5,5 + addi 7,7,64 + lxvd2x 8,0,5 + lxvd2x 0,0,6 + xvabssp 44,44 + xvabssp 9,9 + cmpd 7,8,7 + addi 9,9,256 + lxvd2x 12,0,10 + xvabssp 6,6 + xvabssp 1,1 + xvabssp 11,11 + xvabssp 13,13 + xvabssp 7,7 + xvabssp 2,2 + xvabssp 10,10 + xvabssp 3,3 + xvabssp 8,8 + xvabssp 0,0 + xvabssp 12,12 + xvcmpgtsp 32,40,5 + xvcmpgtsp 62,42,9 + xvcmpgtsp 45,44,6 + xvcmpgtsp 63,1,11 + xvcmpgtsp 39,13,7 + xvcmpgtsp 47,2,10 + xvcmpgtsp 41,3,8 + xvcmpgtsp 33,0,12 + xxsel 5,40,5,32 + xxsel 32,38,35,32 + xxsel 9,42,9,62 + xxsel 6,44,6,45 + xxsel 11,1,11,63 + xxsel 44,38,35,45 + xxsel 7,13,7,39 + xvcmpgtsp 42,5,9 + xxsel 10,2,10,47 + xvcmpgtsp 45,6,11 + xxsel 8,3,8,41 + xxsel 63,36,37,63 + xxsel 0,0,12,33 + xvcmpgtsp 43,7,10 + xxsel 40,36,37,33 + xxsel 62,36,37,62 + xvcmpgtsp 33,8,0 + xxsel 41,38,35,41 + xxsel 39,38,35,39 + xxsel 47,36,37,47 + xxsel 9,5,9,42 + xxsel 42,32,62,42 + xxsel 12,6,11,45 + xxsel 45,44,63,45 + xxsel 11,7,10,43 + xvcmpgtsp 32,9,12 + vadduwm 13,13,18 + xxsel 43,39,47,43 + xxsel 0,8,0,33 + xxsel 33,41,40,33 + xvcmpgtsp 44,11,0 + vadduwm 1,1,18 + xxsel 12,9,12,32 + xxsel 32,42,45,32 + vadduwm 0,2,0 + vadduwm 2,2,17 + xxsel 0,11,0,44 + xxsel 33,43,33,44 + xvcmpgtsp 45,12,0 + vadduwm 1,14,1 + xxsel 0,12,0,45 + xxsel 32,32,33,45 + xvcmpgtsp 33,4,0 + xxsel 51,51,32,33 + xxsel 4,4,0,33 + bgt 7,.L65 + xxsldwi 0,4,4,1 + xscvspdp 10,4 + vspltw 0,19,0 + xxsldwi 12,4,4,3 + xscvspdp 0,0 + mfvsrwz 3,51 + mfvsrwz 6,32 + vspltw 0,19,3 + xscvspdp 12,12 + xxsldwi 4,4,4,2 + mfvsrwz 7,32 + vspltw 0,19,2 + xscvspdp 4,4 + mfvsrwz 9,32 + fcmpu 7,0,10 + rldicl 10,3,0,32 + rldicl 31,6,0,32 + fmr 11,12 + rldicl 5,7,0,32 + rldicl 0,9,0,32 + beq 7,.L66 + bng 7,.L9 + fmr 0,10 + mr 10,31 +.L9: + fcmpu 7,12,4 + bne 7,.L12 + cmplw 7,7,9 + ble 7,.L13 + mr 7,9 +.L13: + rldicl 5,7,0,32 +.L14: + fcmpu 7,0,11 + beq 7,.L67 + bng 7,.L19 + fmr 0,11 + mr 10,5 +.L19: + cmpd 7,11,8 + ld 31,-8(1) + bgt 7,.L4 + b .L33 + .p2align 4,,15 +.L66: + cmplw 7,3,6 + ble 7,.L8 + mr 3,6 +.L8: + rldicl 10,3,0,32 + b .L9 + .p2align 4,,15 +.L40: + sldi 12,5,2 + li 10,0 + li 9,0 + b .L23 + .p2align 4,,15 
+.L12: + bng 7,.L14 + fmr 11,4 + mr 5,0 + b .L14 + .p2align 4,,15 +.L67: + cmpd 7,10,5 + ble 7,.L19 + mr 10,5 + b .L19 +.L51: + li 9,1 + mtctr 9 + b .L22 +.L52: + li 8,1 + mtctr 8 + b .L35 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size isamin_k,.-isamin_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .long 0 + .long 1 + .long 2 + .long 3 +.LC3: + .long 4 + .long 5 + .long 6 + .long 7 +.LC4: + .long 8 + .long 9 + .long 10 + .long 11 +.LC5: + .long 12 + .long 13 + .long 14 + .long 15 +.LC6: + .long 32 + .long 32 + .long 32 + .long 32 +.LC7: + .long 64 + .long 64 + .long 64 + .long 64 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/isamin_power9.S b/kernel/power/isamin_power9.S new file mode 100644 index 000000000..0475edf46 --- /dev/null +++ b/kernel/power/isamin_power9.S @@ -0,0 +1,382 @@ + .file "isamin.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl isamin_k + .type isamin_k, @function +isamin_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry isamin_k,.-isamin_k + mr. 11,3 + ble 0,.L36 + cmpdi 7,5,0 + li 3,0 + blelr 7 + lfs 0,0(4) + cmpdi 7,5,1 + stxv 61,-64(1) + stxv 62,-48(1) + stxv 63,-32(1) + fabs 0,0 + beq 7,.L62 + rldicr. 6,11,0,61 + beq 0,.L40 + sldi 8,5,1 + sldi 0,5,2 + neg 12,5 + std 31,-8(1) + sldi 3,5,4 + sldi 31,5,3 + li 9,0 + li 10,0 + add 5,8,5 + add 7,4,0 + sldi 12,12,2 + sldi 5,5,2 + b .L24 + .p2align 4,,15 +.L41: + mr 10,9 +.L25: + add 7,7,3 + fmr 0,12 +.L24: + lfs 12,0(7) + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L26 + fmr 0,12 + addi 10,9,1 +.L26: + add 8,7,12 + lfsx 12,8,31 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L28 + fmr 0,12 + addi 10,9,2 +.L28: + lfsx 12,8,5 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L30 + fmr 0,12 + addi 10,9,3 +.L30: + addi 9,9,4 + cmpd 7,6,9 + ble 7,.L63 + lfsx 12,8,3 + fabs 12,12 + fcmpu 7,12,0 + blt 7,.L41 + fmr 12,0 + b .L25 + .p2align 4,,15 +.L36: + li 3,0 + blr + .p2align 4,,15 +.L63: + addi 6,6,-1 + ld 31,-8(1) + srdi 6,6,2 + addi 6,6,1 + sldi 9,6,2 + mulld 6,0,6 + cmpd 7,11,9 + ble 7,.L33 +.L23: + addi 8,9,1 + sldi 6,6,2 + subf 7,9,11 + cmpd 7,8,11 + mtctr 7 + add 4,4,6 + bgt 7,.L52 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L52 + .p2align 4,,15 +.L35: + lfs 12,0(4) + add 4,4,0 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L34 + fmr 0,12 + mr 10,9 +.L34: + addi 9,9,1 + bdnz .L35 +.L33: + lxv 61,-64(1) + lxv 62,-48(1) + addi 3,10,1 + lxv 63,-32(1) + blr + .p2align 4,,15 +.L62: + rldicr. 
8,11,0,57 + li 10,0 + bne 0,.L64 +.L4: + addi 7,8,1 + sldi 9,8,2 + subf 6,8,11 + cmpd 7,7,11 + mtctr 6 + add 4,4,9 + bgt 7,.L51 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L51 + .p2align 4,,15 +.L22: + lfs 12,0(4) + addi 4,4,4 + fabs 12,12 + fcmpu 7,0,12 + bng 7,.L21 + fmr 0,12 + mr 10,8 +.L21: + addi 8,8,1 + bdnz .L22 + lxv 61,-64(1) + lxv 62,-48(1) + addi 3,10,1 + lxv 63,-32(1) + blr + .p2align 4,,15 +.L64: + lxv 0,0(4) + xxspltib 47,16 + addis 6,2,.LC2@toc@ha + addis 7,2,.LC3@toc@ha + addis 10,2,.LC4@toc@ha + addis 9,2,.LC5@toc@ha + xxspltib 63,32 + xxspltib 46,64 + addi 6,6,.LC2@toc@l + addi 10,10,.LC4@toc@l + addi 7,7,.LC3@toc@l + std 31,-8(1) + addi 9,9,.LC5@toc@l + xxspltib 50,0 + vextsb2w 15,15 + lxv 48,0(6) + lxv 51,0(10) + vextsb2w 31,31 + vextsb2w 14,14 + xvabssp 4,0 + lxv 34,0(9) + lxv 49,0(7) + mr 9,4 + li 10,0 + xxlor 35,48,48 + xxlor 40,4,4 + b .L6 + .p2align 4,,15 +.L65: + lxv 0,0(9) + xvabssp 40,0 +.L6: + lxv 0,16(9) + vadduwm 29,18,31 + lxv 12,240(9) + addi 10,10,64 + addi 9,9,256 + cmpd 7,8,10 + xvabssp 5,0 + lxv 0,-224(9) + xvabssp 12,12 + xvabssp 32,0 + lxv 0,-208(9) + xvcmpgtsp 42,40,5 + xvabssp 9,0 + lxv 0,-192(9) + xxsel 5,40,5,42 + xvabssp 44,0 + lxv 0,-176(9) + xvcmpgtsp 62,32,9 + xvabssp 6,0 + lxv 0,-160(9) + xxsel 9,32,9,62 + xxsel 32,35,49,42 + xvabssp 1,0 + lxv 0,-144(9) + xxsel 62,51,34,62 + xvcmpgtsp 42,5,9 + xvcmpgtsp 37,44,6 + xvabssp 11,0 + lxv 0,-128(9) + xxsel 9,5,9,42 + xxsel 42,32,62,42 + xxsel 6,44,6,37 + xxsel 37,35,49,37 + xvabssp 13,0 + lxv 0,-112(9) + xvcmpgtsp 36,1,11 + xvabssp 7,0 + lxv 0,-96(9) + xxsel 11,1,11,36 + xxsel 36,51,34,36 + xvabssp 2,0 + lxv 0,-80(9) + xvcmpgtsp 45,6,11 + xvcmpgtsp 39,13,7 + xvabssp 10,0 + lxv 0,-64(9) + xxsel 7,13,7,39 + xxsel 39,35,49,39 + xvabssp 3,0 + lxv 0,-48(9) + xvcmpgtsp 38,2,10 + xvabssp 8,0 + lxv 0,-32(9) + xxsel 10,2,10,38 + xxsel 38,51,34,38 + xvabssp 0,0 + xvcmpgtsp 43,7,10 + xvcmpgtsp 41,3,8 + xvcmpgtsp 33,0,12 + xxsel 8,3,8,41 + xxsel 41,35,49,41 + xxsel 0,0,12,33 + xxsel 40,51,34,33 + xxsel 12,6,11,45 + xxsel 11,7,10,43 + xvcmpgtsp 33,8,0 + xxsel 45,37,36,45 + xvcmpgtsp 32,9,12 + xxsel 43,39,38,43 + vadduwm 13,13,15 + xxsel 0,8,0,33 + xxsel 33,41,40,33 + xxsel 12,9,12,32 + xxsel 32,42,45,32 + xvcmpgtsp 44,11,0 + vadduwm 1,1,15 + vadduwm 0,18,0 + vadduwm 18,18,14 + xxsel 0,11,0,44 + xxsel 33,43,33,44 + xvcmpgtsp 45,12,0 + vadduwm 1,29,1 + xxsel 0,12,0,45 + xxsel 32,32,33,45 + xvcmpgtsp 33,4,0 + xxsel 48,48,32,33 + xxsel 4,4,0,33 + bgt 7,.L65 + xxsldwi 0,4,4,3 + xxsldwi 11,4,4,2 + li 9,0 + li 10,12 + xxsldwi 12,4,4,1 + xscvspdp 4,4 + vextuwrx 3,9,16 + li 9,4 + xscvspdp 0,0 + xscvspdp 11,11 + xscvspdp 12,12 + vextuwrx 6,9,16 + li 9,8 + vextuwrx 7,9,16 + vextuwrx 9,10,16 + rldicl 31,6,0,32 + rldicl 10,3,0,32 + rldicl 5,7,0,32 + rldicl 0,9,0,32 + fcmpu 7,0,11 + fmr 10,12 + beq 7,.L66 + bng 7,.L9 + mr 10,31 + fmr 0,11 +.L9: + fcmpu 7,12,4 + bne 7,.L12 + cmplw 7,7,9 + ble 7,.L13 + mr 7,9 +.L13: + rldicl 5,7,0,32 +.L14: + fcmpu 7,0,10 + beq 7,.L67 + bng 7,.L19 + mr 10,5 + fmr 0,10 +.L19: + cmpd 7,11,8 + ld 31,-8(1) + bgt 7,.L4 + b .L33 + .p2align 4,,15 +.L66: + cmplw 7,3,6 + ble 7,.L8 + mr 3,6 +.L8: + rldicl 10,3,0,32 + b .L9 + .p2align 4,,15 +.L40: + sldi 0,5,2 + li 10,0 + li 9,0 + b .L23 + .p2align 4,,15 +.L12: + bng 7,.L14 + mr 5,0 + fmr 10,4 + b .L14 + .p2align 4,,15 +.L67: + cmpd 7,10,5 + ble 7,.L19 + mr 10,5 + b .L19 +.L51: + li 9,1 + mtctr 9 + b .L22 +.L52: + li 8,1 + mtctr 8 + b .L35 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size isamin_k,.-isamin_k + .section .rodata.cst16,"aM",@progbits,16 + 
.align 4 +.LC2: + .long 0 + .long 1 + .long 2 + .long 3 +.LC3: + .long 4 + .long 5 + .long 6 + .long 7 +.LC4: + .long 8 + .long 9 + .long 10 + .long 11 +.LC5: + .long 12 + .long 13 + .long 14 + .long 15 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits From 6b830793686c06fed3f517ca1e95854891bbed6f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 25 Sep 2019 23:13:24 +0200 Subject: [PATCH 717/935] Count cpu cores on ARMV8 and use that to pick the GEMM_PQ parameters (#2267) There is currently no simple way to query cache sizes on ARMV8, so this takes the number of cores as a trivial indication if the target is a server-class device with a big cache, or just a single-board toy or smartphone. --- cpuid_arm64.c | 30 ++++++++++++++++++++++++++++-- param.h | 31 +++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index e8aa29813..9e019fe3e 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -206,6 +206,33 @@ void get_subdirname(void) printf("arm64"); } +void get_cpucount(void) +{ +int n=0; + +#ifdef linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("processor", buffer, 9)) + n++; + } + + fclose(infile); + + printf("#define NUM_CORES %d\n",n); +#endif + +} + + + void get_cpuconfig(void) { @@ -309,6 +336,7 @@ void get_cpuconfig(void) printf("#define DTB_SIZE 4096 \n"); break; } + get_cpucount(); } @@ -351,5 +379,3 @@ void get_features(void) #endif return; } - - diff --git a/param.h b/param.h index 5fbdbcdcd..0ff59f400 100644 --- a/param.h +++ b/param.h @@ -2636,15 +2636,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 512 -#define DGEMM_DEFAULT_P 256 -#define CGEMM_DEFAULT_P 256 -#define ZGEMM_DEFAULT_P 128 +/*FIXME: this should be using the cache size, but there is currently no easy way to +query that on ARM. 
So if getarch counted more than 8 cores we simply assume the host
+is a big desktop or server with abundant cache rather than a phone or embedded device */
+#if NUM_CORES > 8
+ #define SGEMM_DEFAULT_P 512
+ #define DGEMM_DEFAULT_P 256
+ #define CGEMM_DEFAULT_P 256
+ #define ZGEMM_DEFAULT_P 128
+
+ #define SGEMM_DEFAULT_Q 1024
+ #define DGEMM_DEFAULT_Q 512
+ #define CGEMM_DEFAULT_Q 512
+ #define ZGEMM_DEFAULT_Q 512
+#else
+ #define SGEMM_DEFAULT_P 128
+ #define DGEMM_DEFAULT_P 160
+ #define CGEMM_DEFAULT_P 128
+ #define ZGEMM_DEFAULT_P 128

-#define SGEMM_DEFAULT_Q 1024
-#define DGEMM_DEFAULT_Q 512
-#define CGEMM_DEFAULT_Q 512
-#define ZGEMM_DEFAULT_Q 512
+ #define SGEMM_DEFAULT_Q 352
+ #define DGEMM_DEFAULT_Q 128
+ #define CGEMM_DEFAULT_Q 224
+ #define ZGEMM_DEFAULT_Q 112
+#endif

 #define SGEMM_DEFAULT_R 4096
 #define DGEMM_DEFAULT_R 4096

From 7f58f3ad0e10304965a6573bb11208cb6e1df446 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 27 Sep 2019 00:44:26 +0200
Subject: [PATCH 718/935] Fix mis-edits in the gcc-derived power8 caxpy kernel

---
 kernel/power/caxpy_power8.S | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S
index 09a423571..0ce61ca3b 100644
--- a/kernel/power/caxpy_power8.S
+++ b/kernel/power/caxpy_power8.S
@@ -34,9 +34,9 @@ caxpy_k:
 lfs 0,4(10)
 fmuls 10,2,10
 #ifdef CONJ
- fmsubs 11,11,1,10
-#else
 fmadds 11,11,1,10
+#else
+ fmsubs 11,11,1,10
 #endif
 fadds 12,12,11
 stfs 12,0(10)
@@ -241,8 +241,13 @@ caxpy_k:
 lfsx 12,8,5
 lfsx 0,10,5
 fmuls 11,2,11
+#ifdef CONJ
 fmsubs 12,1,12,11
 fsubs 0,0,12
+#else
+ fmadds 12,1,12,11
+ fadds 0,0,12
+#endif
 stfsx 0,10,5
 ble 7,.L39
 sldi 6,6,2
From 596a22325a1123bed772c61d298c8d14d187cfe3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 27 Sep 2019 00:47:18 +0200
Subject: [PATCH 719/935] Fix prologue of power9 assembly cdot(c) kernel to provide cdotc

---
 kernel/power/cdot_power9.S | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/power/cdot_power9.S b/kernel/power/cdot_power9.S
index 01d194c0c..9ec7cdd85 100644
--- a/kernel/power/cdot_power9.S
+++ b/kernel/power/cdot_power9.S
@@ -1,10 +1,16 @@
- .file "cdot.c"
+#define ASSEMBLER
+#include "common.h"
+/*
+.file "cdot.c"
 .abiversion 2
 .section ".text"
 .align 2
 .p2align 4,,15
 .globl cdot_k
 .type cdot_k, @function
+*/
+ PROLOGUE
+
 cdot_k:
 .LCF0:
 0: addis 2,12,.TOC.-.LCF0@ha
From ede5efebabb5dbde46175b996df59c755248bf29 Mon Sep 17 00:00:00 2001
From: AbdelRauf
Date: Sun, 29 Sep 2019 02:27:50 +0000
Subject: [PATCH 720/935] trmm fix

---
 kernel/power/sgemm_logic_power9.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S
index 053836cbf..a34ed32b8 100644
--- a/kernel/power/sgemm_logic_power9.S
+++ b/kernel/power/sgemm_logic_power9.S
@@ -136,8 +136,8 @@ LSGEMM_L8x16_BEGIN:
 #endif
 ZERO8x16
- mtctr L
 ble LSGEMM_L8x16_SUB0
+ mtctr L
 bl LSGEMM_L8x16_LMAIN_SUB
 andi. L, T12, 127
 ble LSGEMM_L8x16_SAVE
@@ -146,7 +146,7 @@ LSGEMM_L8x16_BEGIN:
 LSGEMM_L8x16_SUB0:
#if defined(TRMMKERNEL)
 andi. L, T11, 255
- cmpwi T11,128
+ cmpwi T11,129
#else
 andi. L, K, 255
 cmpwi K,129
From 6355c25dde1ccba0fe6521dc0b36c0fcdddda0ef Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Sun, 29 Sep 2019 22:03:12 -0700
Subject: [PATCH 721/935] Avoid taking root of negative number in symv_thread.c

This is similar to fixes in gh-1929, but there was one remaining
occurrence of this type of pattern in the driver/level2/*_thread.c files.
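
For reference, a minimal standalone sketch of the guarded pattern used here
(not part of this patch; variable names mirror symv_thread.c, and BLASLONG is
shown as plain long): the strip width is chosen so that each thread gets
roughly equal triangular work, and near the end of the matrix the discriminant
di * di - dnum can turn negative, so it has to be tested before calling sqrt():

    #include <math.h>

    /* illustrative only -- same guard as in the patch below */
    static long balanced_width(long i, long m, double dnum, long mask) {
      double di = (double)(m - i);       /* rows remaining */
      long width;
      if (di * di - dnum > 0) {          /* root is real, use it */
        width = ((long)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
      } else {
        width = m - i;                   /* otherwise take the remainder */
      }
      if (width < 4) width = 4;
      if (width > m - i) width = m - i;
      return width;
    }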
--- driver/level2/symv_thread.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c index ab783de2b..d7cc01768 100644 --- a/driver/level2/symv_thread.c +++ b/driver/level2/symv_thread.c @@ -166,7 +166,11 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i if (nthreads - num_cpu > 1) { double di = (double)i; - width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask; + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } if (width < 4) width = 4; if (width > m - i) width = m - i; @@ -212,9 +216,9 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i double di = (double)(m - i); if (di * di - dnum > 0) { - width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { - width = m - i; + width = m - i; } if (width < 4) width = 4; From 8617d75548ae7be8f78406c15f98e218ad89a42a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Oct 2019 23:50:41 +0200 Subject: [PATCH 722/935] Revert "Avoid taking root of negative number in symv_thread.c" --- driver/level2/symv_thread.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c index d7cc01768..ab783de2b 100644 --- a/driver/level2/symv_thread.c +++ b/driver/level2/symv_thread.c @@ -166,11 +166,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i if (nthreads - num_cpu > 1) { double di = (double)i; - if (di * di - dnum > 0) { - width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; - } else { - width = m - i; - } + width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask; if (width < 4) width = 4; if (width > m - i) width = m - i; @@ -216,9 +212,9 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i double di = (double)(m - i); if (di * di - dnum > 0) { - width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { - width = m - i; + width = m - i; } if (width < 4) width = 4; From ac10236cc8a7b61e3fa37741ca903ea4d990a62e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Oct 2019 22:35:34 +0200 Subject: [PATCH 723/935] Update the OSX BINARY=32 test to xcode9.2 in response to Homebrew updates --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2b1b99b26..51c55acf5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -169,7 +169,7 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos - osx_image: xcode8.3 + osx_image: xcode9.2 env: - BTYPE="BINARY=32" From 32f5907fef1b1a68a3af20278c4f3b3b54b5268b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Oct 2019 01:09:02 +0200 Subject: [PATCH 724/935] Update 32bit macOS again to xcode 9.3 os version 10.13 "High Sierra" appears to be the oldest release now for which Homebrew provides a gcc package. 
Anything older and the Travis job will run out of time building gcc from source --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 51c55acf5..28f95f5e2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -169,7 +169,7 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos - osx_image: xcode9.2 + osx_image: xcode9.3 env: - BTYPE="BINARY=32" From bb5413863fbf52dc5b8f2fd1b814b80c938d8c39 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Oct 2019 14:50:03 +0200 Subject: [PATCH 725/935] Rewrite ARM64 PROLOGUE to make it compatible with xcode/ios --- common_arm64.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index c6ef2fb5d..c5e6948dc 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -103,12 +103,14 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) -#define PROLOGUE \ - .text ;\ - .align 4 ;\ - .global REALNAME ;\ - .type REALNAME, %function ;\ +.macro PROLOGUE + .text ; + .p2align 2 ; + .global REALNAME ; + .type REALNAME, %function ; REALNAME: +.endm + #define EPILOGUE From 56837e9d92c41290b07bc924915c633e39401abb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Oct 2019 14:53:23 +0200 Subject: [PATCH 726/935] Make local labels in macro compatible with the xcode assembler ... which does not perform the automatic numbering on instantiation that the _@ suffix signifies --- kernel/arm64/nrm2.S | 19 ++++++++++--------- kernel/arm64/znrm2.S | 38 +++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S index e2cbd4def..d4f0374cb 100644 --- a/kernel/arm64/nrm2.S +++ b/kernel/arm64/nrm2.S @@ -54,37 +54,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) ldr s4, [X], #4 fcmp s4, REGZERO - beq KERNEL_F1_NEXT_\@ + beq 2f /* KERNEL_F1_NEXT_\@ */ + beq 2f fabs s4, s4 fcmp SCALE, s4 - bge KERNEL_F1_SCALE_GE_X_\@ + bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */ fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 - b KERNEL_F1_NEXT_\@ -KERNEL_F1_SCALE_GE_X_\@: + b 2f /* KERNEL_F1_NEXT_\@ */ +1: /* KERNEL_F1_SCALE_GE_X_\@: */ fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X], #8 fcmp d4, REGZERO - beq KERNEL_F1_NEXT_\@ + beq 2f /* KERNEL_F1_NEXT_\@ */ fabs d4, d4 fcmp SCALE, d4 - bge KERNEL_F1_SCALE_GE_X_\@ + bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */ fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 - b KERNEL_F1_NEXT_\@ -KERNEL_F1_SCALE_GE_X_\@: + b 2f /* KERNEL_F1_NEXT_\@ */ +1: /* KERNEL_F1_SCALE_GE_X_\@: */ fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] #endif -KERNEL_F1_NEXT_\@: +2: /* KERNEL_F1_NEXT_\@: */ .endm .macro KERNEL_S1 diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S index 1c89685ea..ce3f7d4ed 100644 --- a/kernel/arm64/znrm2.S +++ b/kernel/arm64/znrm2.S @@ -54,69 +54,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) ldr s4, [X], #4 fcmp s4, REGZERO - beq KERNEL_F1_NEXT_\@ + beq 2f /* KERNEL_F1_NEXT_\@ */ fabs s4, s4 fcmp SCALE, s4 - bge KERNEL_F1_SCALE_GE_XR_\@ + bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */ fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 - b KERNEL_F1_NEXT_\@ -KERNEL_F1_SCALE_GE_XR_\@: + b 2f /* KERNEL_F1_NEXT_\@ */ +1: /* KERNEL_F1_SCALE_GE_XR_\@: */ fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] -KERNEL_F1_NEXT_\@: +2: /* KERNEL_F1_NEXT_\@: */ ldr s5, [X], #4 fcmp s5, REGZERO - beq KERNEL_F1_END_\@ + beq 4f /* KERNEL_F1_END_\@ */ fabs s5, s5 fcmp SCALE, s5 - bge KERNEL_F1_SCALE_GE_XI_\@ + bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */ fdiv s2, SCALE, s5 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s5 - b KERNEL_F1_END_\@ -KERNEL_F1_SCALE_GE_XI_\@: + b 4f /* KERNEL_F1_END_\@ */ +3: /* KERNEL_F1_SCALE_GE_XI_\@: */ fdiv s2, s5, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X], #8 fcmp d4, REGZERO - beq KERNEL_F1_NEXT_\@ + beq 2f /* KERNEL_F1_NEXT_\@ */ fabs d4, d4 fcmp SCALE, d4 - bge KERNEL_F1_SCALE_GE_XR_\@ + bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */ fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 - b KERNEL_F1_NEXT_\@ -KERNEL_F1_SCALE_GE_XR_\@: + b 2f /* KERNEL_F1_NEXT_\@ */ +1: /* KERNEL_F1_SCALE_GE_XR_\@: */ fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] -KERNEL_F1_NEXT_\@: +2: /* KERNEL_F1_NEXT_\@: */ ldr d5, [X], #8 fcmp d5, REGZERO - beq KERNEL_F1_END_\@ + beq 4f /* KERNEL_F1_END_\@ */ fabs d5, d5 fcmp SCALE, d5 - bge KERNEL_F1_SCALE_GE_XI_\@ + bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */ fdiv d2, SCALE, d5 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d5 - b KERNEL_F1_END_\@ -KERNEL_F1_SCALE_GE_XI_\@: + b 4f /* KERNEL_F1_END_\@ */ +3: /* KERNEL_F1_SCALE_GE_XI_\@: */ fdiv d2, d5, SCALE fmla SSQ, d2, v2.d[0] #endif -KERNEL_F1_END_\@: +4: /* KERNEL_F1_END_\@: */ .endm .macro KERNEL_S1 From 258ac56e0aa46e9b7120bcb5635d1bba48f4c2aa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Oct 2019 10:52:47 +0200 Subject: [PATCH 727/935] Move 32bit OSX build back to xcode 8.3 but switch to gcc8 --- .travis.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 28f95f5e2..72e29091d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -162,16 +162,16 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - brew install gcc # for gfortran + - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="BINARY=64 INTERFACE64=1" + - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8" - <<: *test-macos - osx_image: xcode9.3 + osx_image: xcode8.3 env: - - BTYPE="BINARY=32" + - BTYPE="BINARY=32 FC=gfortran-8" # whitelist branches: From 3a2df19db6b9bacd88974fbf87ef5b335fa2856f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Oct 2019 08:09:26 +0200 Subject: [PATCH 728/935] Fix accidental duplication of jump instruction --- kernel/arm64/nrm2.S | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S index d4f0374cb..0e5a8eed1 100644 --- a/kernel/arm64/nrm2.S +++ b/kernel/arm64/nrm2.S @@ -55,7 +55,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr s4, [X], #4 fcmp s4, REGZERO beq 2f /* KERNEL_F1_NEXT_\@ */ - beq 2f fabs s4, s4 fcmp SCALE, s4 bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */ From a448884a63f59f54da197a7e1fe921be715ce6d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Oct 2019 08:37:50 +0200 Subject: [PATCH 729/935] Remove automatic label postfixes from macro included only once --- kernel/arm64/znrm2.S | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S index ce3f7d4ed..a530b80f0 100644 --- a/kernel/arm64/znrm2.S +++ b/kernel/arm64/znrm2.S @@ -123,69 +123,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) ldr s4, [X] fcmp s4, REGZERO - beq KERNEL_S1_NEXT_\@ + beq KERNEL_S1_NEXT fabs s4, s4 fcmp SCALE, s4 - bge KERNEL_S1_SCALE_GE_XR_\@ + bge KERNEL_S1_SCALE_GE_XR fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 - b KERNEL_S1_NEXT_\@ -KERNEL_S1_SCALE_GE_XR_\@: + b KERNEL_S1_NEXT +KERNEL_S1_SCALE_GE_XR: fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] -KERNEL_S1_NEXT_\@: +KERNEL_S1_NEXT: ldr s5, [X, #4] fcmp s5, REGZERO - beq KERNEL_S1_END_\@ + beq KERNEL_S1_END fabs s5, s5 fcmp SCALE, s5 - bge KERNEL_S1_SCALE_GE_XI_\@ + bge KERNEL_S1_SCALE_GE_XI fdiv s2, SCALE, s5 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s5 - b KERNEL_S1_END_\@ -KERNEL_S1_SCALE_GE_XI_\@: + b KERNEL_S1_END +KERNEL_S1_SCALE_GE_XI: fdiv s2, s5, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X] fcmp d4, REGZERO - beq KERNEL_S1_NEXT_\@ + beq KERNEL_S1_NEXT fabs d4, d4 fcmp SCALE, d4 - bge KERNEL_S1_SCALE_GE_XR_\@ + bge KERNEL_S1_SCALE_GE_XR fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 - b KERNEL_S1_NEXT_\@ -KERNEL_S1_SCALE_GE_XR_\@: + b KERNEL_S1_NEXT +KERNEL_S1_SCALE_GE_XR: fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] -KERNEL_S1_NEXT_\@: +KERNEL_S1_NEXT: ldr d5, [X, #8] fcmp d5, REGZERO - beq KERNEL_S1_END_\@ + beq KERNEL_S1_END fabs d5, d5 fcmp SCALE, d5 - bge KERNEL_S1_SCALE_GE_XI_\@ + bge KERNEL_S1_SCALE_GE_XI fdiv d2, SCALE, d5 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d5 - b KERNEL_S1_END_\@ -KERNEL_S1_SCALE_GE_XI_\@: + b KERNEL_S1_END +KERNEL_S1_SCALE_GE_XI: fdiv d2, d5, SCALE fmla SSQ, d2, v2.d[0] #endif -KERNEL_S1_END_\@: +KERNEL_S1_END: add X, X, INC_X .endm From f2cde2ccfb5c58a38300cf003c3edbe2a607c516 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Oct 2019 20:12:08 +0200 Subject: [PATCH 730/935] Update common_arm64.h --- common_arm64.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/common_arm64.h b/common_arm64.h index c5e6948dc..13718af5a 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -78,7 +78,17 @@ static void __inline blas_lock(volatile BLASULONG *address){ #define BLAS_LOCK_DEFINED +static __inline BLASULONG rpcc(void){ + BLASULONG ret = 0; + + __asm__ __volatile__ ("mrs %0,cntvct_el0":"=r"(ret)); + return ret; +} + +#define RPCC_DEFINED +#define RPCC64BIT + static inline int blas_quickdivide(blasint x, blasint y){ return x / y; From 5f6206fa2de4f533f003379588ca2a8b294e6c2f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Oct 2019 20:13:14 +0200 Subject: [PATCH 731/935] Simplify OSX/IOS cross-compilation and add a CI test for it (#2279) * Add automatic fixups for OSX/IOS cross-compilation * Add OSX/IOS cross-compilation test to Travis CI * Handle platforms that lack hwcap.h by falling back to ARMV8 * Fix PROLOGUE for OSX/IOS --- .travis.yml 
| 8 ++++++++
 c_check                       | 13 +++++++++++++
 common_arm64.h                |  2 ++
 driver/others/dynamic_arm64.c |  8 +++++++-
 4 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 72e29091d..6016ec1fe 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -173,6 +173,14 @@ matrix:
     env:
     - BTYPE="BINARY=32 FC=gfortran-8"
 
+  - <<: *test-macos
+    osx_image: xcode10.1
+    env:
+    - COMMON_FLAGS="NUM_THREADS=32"
+    - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
+    - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
+    - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
+
 # whitelist
 branches:
   only:
diff --git a/c_check b/c_check
index 271182c54..3d82aa73c 100644
--- a/c_check
+++ b/c_check
@@ -260,6 +260,19 @@ if ($architecture ne $hostarch) {
 
 $cross = 1 if ($os ne $hostos);
 
+# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS
+# the initial autodetection will have been confused by the command-line arguments to clang
+# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output
+if (($os eq "Darwin") && ($cross_suffix ne "")) {
+  my $tmpnam = `xcrun --sdk iphoneos --find clang`;
+  $cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 );
+# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/";
+  $cross =1;
+  $architecture = arm64;
+}
+
+
+
 $openmp = "" if $ENV{USE_OPENMP} != 1;
 
 $linker_L = "";
diff --git a/common_arm64.h b/common_arm64.h
index c5e6948dc..f27ca8c63 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -107,7 +107,9 @@ static inline int blas_quickdivide(blasint x, blasint y){
   .text ;
   .p2align 2 ;
   .global REALNAME ;
+#ifndef __APPLE__
   .type REALNAME, %function ;
+#endif
 REALNAME:
 .endm
 
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index b4ce6b67d..9db9ba17d 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -37,8 +37,10 @@
 /*********************************************************************/
 
 #include "common.h"
+#if (defined OS_LINUX || defined OS_ANDROID)
 #include <asm/hwcap.h>
 #include <sys/auxv.h>
+#endif
 
 extern gotoblas_t gotoblas_ARMV8;
 extern gotoblas_t gotoblas_CORTEXA57;
@@ -105,13 +107,17 @@ static gotoblas_t *force_coretype(char *coretype) {
 static gotoblas_t *get_coretype(void) {
 
 int implementer, variant, part, arch, revision, midr_el1;
-
+
+#if (defined OS_LINUX || defined OS_ANDROID)
 if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
 char coremsg[128];
 snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
 openblas_warning(1, coremsg);
 return NULL;
 }
+#else
+ return NULL;
+#endif
 
 get_cpu_ftr(MIDR_EL1, midr_el1);
 /*
From f262031685ee8f912f8b2b1f0cdc136fec1f550c Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 8 Oct 2019 22:30:02 +0200
Subject: [PATCH 732/935] Support QEMU virtual cpu as CORE2

qemu itself claims it is a 64bit P6, which does not exist in the wild.
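
For reference, a hedged sketch (not part of this patch) of how the guest's
claimed family/model can be dumped to confirm this, using GCC's cpuid.h
helper and the standard decoding of CPUID leaf 1; the affected QEMU guests
report family 6, model 6:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void) {
      unsigned int eax, ebx, ecx, edx;
      if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
      printf("family %u model %u\n",
             (eax >> 8) & 0xf,    /* base family */
             (eax >> 4) & 0xf);   /* base model  */
      return 0;
    }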
--- cpuid_x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 8c954bf21..2181db4db 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1197,7 +1197,11 @@ int get_cpuname(void){ case 3: case 5: case 6: +#ifdef __64BIT__ + return CPUTYPE_CORE2; +#else return CPUTYPE_PENTIUM2; +#endif case 7: case 8: case 10: From e8a2aed2b9ccf4dbc78e622df408e1f77f837ab0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Oct 2019 18:24:13 +0200 Subject: [PATCH 733/935] Support QEMU cpu calling itself 64bit AMD Athlon as well Some QEMU instances pretend to be "AuthenticAMD" with the same family 6/model 6 even when running on an Intel host (could be related to qemu or libvirt version and/or kvm availability). Also fix the define to depend on __x86_64__ set by the compiler, the defines using __64BIT__ will only work for getarch_2nd. --- cpuid_x86.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 2181db4db..92c8e1b67 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1197,11 +1197,7 @@ int get_cpuname(void){ case 3: case 5: case 6: -#ifdef __64BIT__ - return CPUTYPE_CORE2; -#else return CPUTYPE_PENTIUM2; -#endif case 7: case 8: case 10: @@ -1383,8 +1379,6 @@ int get_cpuname(void){ break; case 7: // family 6 exmodel 7 switch (model) { - case 10: // Goldmont Plus - return CPUTYPE_NEHALEM; case 14: // Ice Lake if(support_avx512()) return CPUTYPE_SKYLAKEX; @@ -1431,7 +1425,11 @@ int get_cpuname(void){ case 0x5: return CPUTYPE_AMDK6; case 0x6: +#if defined(__x86_64__) || defined(__amd64__) + return CPUTYPE_BARCELONA; +#else return CPUTYPE_ATHLON; +#endif case 0xf: switch (exfamily) { case 0: @@ -1814,7 +1812,11 @@ int get_coretype(void){ case 4: case 5: case 6: +#if defined(__x86_64__) || defined(__amd64__) + return CORE_CORE2; +#else return CORE_P6; +#endif case 7: return CORE_KATMAI; case 8: @@ -2021,7 +2023,11 @@ int get_coretype(void){ if (vendor == VENDOR_AMD){ if (family <= 0x5) return CORE_80486; +#if defined(__x86_64__) || defined(__amd64__) + if (family <= 0xe) return CORE_BARCELONA; +#else if (family <= 0xe) return CORE_ATHLON; +#endif if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else if (exfamily == 5) return CORE_BOBCAT; From 844629af5702148b5eaee909472f4a80b368498d Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 02:00:34 +0800 Subject: [PATCH 734/935] Add files via upload --- kernel/x86_64/KERNEL.SKYLAKEX | 4 +- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 811 ++++++++++++++++++++++ 2 files changed, 812 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_8x8_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index d61c51628..d73a47925 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,10 +7,8 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c +DGEMMKERNEL = dgemm_kernel_8x8_skylakex.c -#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c -#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c new file mode 100644 index 000000000..b4a87cbce --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -0,0 +1,811 @@ +#include "common.h" +#include +/* 
row-major c_block */ +/* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/ +#define INNER_KERNEL_k1m1n8 \ + "prefetcht0 384(%1);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" + +#define INNER_KERNEL_k1m2n8 \ + INNER_KERNEL_k1m1n8\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m4n8 \ + INNER_KERNEL_k1m2n8\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m8n8 \ + INNER_KERNEL_k1m4n8\ + "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\ + "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\ + "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\ + "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" + +#define INNER_KERNEL_k1m1n16 \ + "prefetcht0 384(%1); prefetcht0 448(%1);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m2n16 \ + INNER_KERNEL_k1m1n16\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m2n16\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m4n16\ + "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\ + "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\ + "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\ + "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" + +#define INNER_KERNEL_k1m1n24 \ + "prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" + +#define INNER_KERNEL_k1m2n24 \ + INNER_KERNEL_k1m1n24\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" + +#define INNER_KERNEL_k1m4n24 \ + INNER_KERNEL_k1m2n24\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;" + +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m4n24\ + "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\ + "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\ + "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\ + "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;" + +#define INNER_KERNELm1(nn) \ + "cmpq $1,%2;jb "#nn"3f;"\ 
+ #nn"4:\n\t"\ + INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ + #nn"3:\n\t" + +#define INNER_KERNELm2(nn) \ + "cmpq $1,%2;jb "#nn"0f;"\ + #nn"1:\n\t"\ + INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ + #nn"0:\n\t" + +#define INNER_KERNELm4(nn) \ + "cmpq $1,%2;jb "#nn"00f;"\ + #nn"01:\n\t"\ + INNER_KERNEL_k1m4n##nn "addq $32,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ + #nn"00:\n\t" + +#define INNER_KERNELm8(nn) \ + "cmpq $8,%2;jb "#nn"001f;"\ + #nn"008:\n\t"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\ + #nn"001:\n\t"\ + "cmpq $1,%2;jb "#nn"000f;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"001b;"\ + ""#nn"000:\n\t" + +#define INNER_INIT_m1n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8;" + +#define INNER_INIT_m2n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" + +#define INNER_INIT_m4n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" + +#define INNER_INIT_m8n8 \ + INNER_INIT_m4n8\ + "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" + +#define INNER_INIT_m1n16 INNER_INIT_m2n8 + +#define INNER_INIT_m2n16 INNER_INIT_m4n8 + +#define INNER_INIT_m4n16 INNER_INIT_m8n8 + +#define INNER_INIT_m8n16 \ + INNER_INIT_m8n8\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ + "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" + +#define INNER_INIT_m1n24 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" + +#define INNER_INIT_m2n24 \ + INNER_INIT_m1n24\ + "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" + +#define INNER_INIT_m4n24 \ + INNER_INIT_m4n16\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" + +#define INNER_INIT_m8n24 \ + INNER_INIT_m8n16\ + "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ + "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" + +#define INNER_SETINDEX \ + "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ + "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" + +#define INNER_STORE_m1n8(c1,disp) \ + "kxnorw %%k1,%%k1,%%k1;"\ + "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vaddpd %%zmm7,"#c1","#c1";"\ + "kxnorw %%k1,%%k1,%%k1;"\ + "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) 
%{%%k1%};" + +#define INNER_SAVE_m1n8 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0) + +#define INNER_SAVE_m1n16 \ + INNER_SAVE_m1n8\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0) + +#define INNER_SAVE_m1n24 \ + INNER_SAVE_m1n16\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm10,0) + +#define INNER_SAVE_m2n8 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm9,8) + +#define INNER_SAVE_m2n16 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm10,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm11,8) + +#define INNER_SAVE_m2n24 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm11,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm12,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm10,0)\ + INNER_STORE_m1n8(%%zmm13,8) + +#define INNER_PREF_8x8 \ + "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\ + "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\ + "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\ + "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\ + "subq %4,%3; subq %4,%3; subq %4,%3;" + +#define INNER_TRANS_4x8(c1,c2,c3,c4) \ + "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ + "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ + "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\ + "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ + "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ + "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" + +#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + INNER_TRANS_4x8(c1,c2,c3,c4)\ + INNER_TRANS_4x8(c5,c6,c7,c8)\ + "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\ + "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\ + "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\ + "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ + "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ + "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" + +#define INNER_STORE_4x8(c1,c2,c3,c4) \ + "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vaddpd %%zmm4,"#c1","#c1";"\ + "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vaddpd %%zmm5,"#c2","#c2";"\ + "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vaddpd %%zmm6,"#c3","#c3";"\ + "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vaddpd %%zmm7,"#c4","#c4";"\ + "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "leaq (%3,%4,4),%3;" + +#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vaddpd (%3),"#c1","#c1"; vmovupd "#c1",(%3); vaddpd 
(%3,%4,1),"#c2","#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vaddpd (%3),"#c3","#c3"; vmovupd "#c3",(%3); vaddpd (%3,%4,1),"#c4","#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vaddpd (%3),"#c5","#c5"; vmovupd "#c5",(%3); vaddpd (%3,%4,1),"#c6","#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vaddpd (%3),"#c7","#c7"; vmovupd "#c7",(%3); vaddpd (%3,%4,1),"#c8","#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define INNER_SAVE_m4n8 \ + INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ + INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) + +#define INNER_SAVE_m4n16 \ + INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ + INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ + INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ + INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) + +#define INNER_SAVE_m4n24 \ + INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ + INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ + INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ + INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ + INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ + INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) + +#define INNER_SAVE_m8n8 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m8n16 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ + INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ + INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) + +#define INNER_SAVE_m8n24 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ + INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ + INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ + INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) + +#define COMPUTE_m1n8 {\ + __asm__ __volatile__(\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\ + :"zmm4","zmm5","zmm6","zmm7","zmm8","cc","memory","k1");\ + c_pointer += 1;\ +} +#define COMPUTE_m2n8 {\ + __asm__ __volatile__(\ + INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\ + :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","cc","memory","k1");\ + c_pointer += 2;\ +} +#define COMPUTE_m4n8 {\ + __asm__ __volatile__(\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03),"Yk"(k01)\ + :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","cc","memory");\ + c_pointer += 4;\ +} +#define COMPUTE_m8n8 {\ + __asm__ __volatile__(\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + 
:"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03)\ + :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory");\ + c_pointer += 8;\ +} + +#define COMPUTE_n8 {\ + __asm__ __volatile__(\ + "movq %8,%%r14;movq %2,%%r13;"\ + "cmpq $8,%8; jb 42222f;"\ + "42221:\n\t"\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ + "42222:\n\t"\ + "cmpq $4,%8; jb 42223f;"\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "42223:\n\t"\ + "cmpq $2,%8; jb 42224f;"\ + INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "42224:\n\t"\ + "cmpq $1,%8; jb 42225f;"\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "addq $8,%3;"\ + "42225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ + ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n16 {\ + __asm__ __volatile__(\ + "movq %8,%%r14;movq %2,%%r13;"\ + "cmpq $8,%8; jb 32222f;"\ + "32221:\n\t"\ + INNER_INIT_m8n16\ + INNER_KERNELm8(16)\ + INNER_SAVE_m8n16\ + "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ + "32222:\n\t"\ + "cmpq $4,%8; jb 32223f;"\ + INNER_INIT_m4n16\ + INNER_KERNELm4(16)\ + INNER_SAVE_m4n16\ + "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "32223:\n\t"\ + "cmpq $2,%8; jb 32224f;"\ + INNER_INIT_m2n16\ + INNER_KERNELm2(16)\ + INNER_SAVE_m2n16\ + "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\ + "subq $2,%8;"\ + "32224:\n\t"\ + "cmpq $1,%8; jb 32225f;"\ + INNER_INIT_m1n16\ + INNER_KERNELm1(16)\ + INNER_SAVE_m1n16\ + "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\ + "32225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ + :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ + ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n24 {\ + __asm__ __volatile__(\ + "movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\ + "cmpq $8,%8; jb 22222f;"\ + "22221:\n\t"\ + INNER_INIT_m8n24\ + "prefetcht2 (%%r15); prefetcht2 64(%%r15);"\ + INNER_KERNELm8(24)\ + "prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\ + INNER_SAVE_m8n24\ + "prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + 
"subq $8,%8; cmpq $8,%8; jnb 22221b;"\ + "22222:\n\t"\ + "cmpq $4,%8; jb 22223f;"\ + INNER_INIT_m4n24\ + INNER_KERNELm4(24)\ + INNER_SAVE_m4n24\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "22223:\n\t"\ + "cmpq $2,%8; jb 22224f;"\ + INNER_INIT_m2n24\ + INNER_KERNELm2(24)\ + INNER_SAVE_m2n24\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\ + "subq $2,%8;"\ + "22224:\n\t"\ + "cmpq $1,%8; jb 22225f;"\ + INNER_INIT_m1n24\ + INNER_KERNELm1(24)\ + INNER_SAVE_m1n24\ + "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\ + "22225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ + :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\ + "+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\ + ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ + "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\ + a_block_pointer -= M * K;\ +} + +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +//perform C += A B + if(k==0 || m==0 || ndiv8==0) return; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); + int64_t K = (int64_t)k; int64_t M = (int64_t)m; + double *a_block_pointer; + double *c_pointer = c; + __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; + BLASLONG ndiv8_count; + double *b_scratch = (double *)aligned_alloc(64,192*k); + double *packed_b_pointer = packed_b; + a_block_pointer = packed_a; + for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ + __asm__ __volatile__ ( + "testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;" + "100001:\n\t" + "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;" + "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd %%zmm7,128(%1); addq $192,%1;" + "decq %2; testq %2,%2; jnz 100001b;" + "100002:\n\t" + "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;" + :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7"); + COMPUTE_n24 + } + for(;ndiv8_count>1;ndiv8_count-=2){ + __asm__ __volatile__ ( + "testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;" + "1000001:\n\t" + "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;" + "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;" + "decq %2; testq %2,%2; jnz 1000001b;" + "1000002:\n\t" + "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;" + :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6"); + COMPUTE_n16 + } + if(ndiv8_count>0){ + COMPUTE_n8 + } + free(b_scratch);b_scratch=NULL; +} + +/* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */ +/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ +/* __m128d accumulators: xc1-xc4; temporary variables: xa1,xb1-xb2 */ +/* double accumulator: sc1; temporary variables: sa1,sb1 */ +/* column-major c_block */ +#define KERNEL_m8n4k1 {\ + __asm__ __volatile__(\ + "vmovupd (%0),%2; addq $64,%0;"\ + "vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%5; 
"\ + "vbroadcastsd 8(%1),%4; vfmadd231pd %2,%4,%6; "\ + "vbroadcastsd 16(%1),%3; vfmadd231pd %2,%3,%7; "\ + "vbroadcastsd 24(%1),%4; vfmadd231pd %2,%4,%8; "\ + "addq $32,%1;"\ + :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zb2),"+v"(zc1),"+v"(zc2),"+v"(zc3),"+v"(zc4)::"cc","memory");\ +} +#define KERNEL_m8n2k1 {\ + __asm__ __volatile__(\ + "vmovupd (%0),%2; addq $64,%0;"\ + "vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%5; "\ + "vbroadcastsd 8(%1),%4; vfmadd231pd %2,%4,%6; "\ + "addq $16,%1;"\ + :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zb2),"+v"(zc1),"+v"(zc2)::"cc","memory");\ +} +#define KERNEL_m8n1k1 {\ + __asm__ __volatile__(\ + "vmovupd (%0),%2; addq $64,%0;"\ + "vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%4; "\ + "addq $8,%1;"\ + :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zc1)::"cc","memory");\ +} +#define INIT_m8n1 zc1=_mm512_setzero_pd(); +#define INIT_m8n2 zc2=INIT_m8n1 +#define INIT_m8n4 zc4=zc3=INIT_m8n2 +#define SAVE_m8n1 {\ + za1 = _mm512_loadu_pd(c_pointer);\ + zc1 = _mm512_add_pd(zc1,za1);\ + _mm512_storeu_pd(c_pointer,zc1);\ + c_pointer += 8;\ +} +#define SAVE_m8n2 {\ + zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ + zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\ + _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\ + c_pointer += 8;\ +} +#define SAVE_m8n4 {\ + zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ + zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\ + _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\ + c_pointer += LDC*2;\ + zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ + zc3 = _mm512_add_pd(zc3,zb1); zc4 = _mm512_add_pd(zc4,zb2);\ + _mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\ + c_pointer += 8-LDC*2;\ +} +#define KERNEL_m4n4k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ + b_block_pointer+=4;\ +} +#define KERNEL_m4n2k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + b_block_pointer+=2;\ +} +#define KERNEL_m4n1k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + b_block_pointer++;\ +} +#define INIT_m4n1 yc1=_mm256_setzero_pd(); +#define INIT_m4n2 yc2=INIT_m4n1 +#define INIT_m4n4 yc4=yc3=INIT_m4n2 +#define SAVE_m4n1 {\ + ya1 = _mm256_loadu_pd(c_pointer);\ + yc1 = _mm256_add_pd(yc1,ya1);\ + _mm256_storeu_pd(c_pointer,yc1);\ + c_pointer += 4;\ +} +#define SAVE_m4n2 {\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += 4;\ +} +#define SAVE_m4n4 {\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); 
_mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += LDC*2;\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc3 = _mm256_add_pd(yc3,yb1); yc4 = _mm256_add_pd(yc4,yb2);\ + _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ + c_pointer += 4-LDC*2;\ +} +#define KERNEL_m2n2k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ + b_block_pointer += 2;\ +} +#define KERNEL_m2n1k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + b_block_pointer ++;\ +} +#define INIT_m2n1 xc1=_mm_setzero_pd(); +#define INIT_m2n2 xc2=INIT_m2n1 +#define SAVE_m2n1 {\ + xa1 = _mm_loadu_pd(c_pointer);\ + xc1 = _mm_add_pd(xc1,xa1);\ + _mm_storeu_pd(c_pointer,xc1);\ + c_pointer += 2;\ +} +#define SAVE_m2n2 {\ + xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ + xc1 = _mm_add_pd(xc1,xb1); xc2 = _mm_add_pd(xc2,xb2);\ + _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ + c_pointer += 2;\ +} +#define KERNEL_m1n1k1 {\ + sa1 = *a_block_pointer; a_block_pointer++;\ + sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ + b_block_pointer ++;\ +} +#define INIT_m1n1 sc1=0.0; +#define SAVE_m1n1 {\ + *c_pointer += sc1;\ + c_pointer++;\ +} + +/* row-major c_block */ +#define KERNEL_m2n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ + a_block_pointer += 2;\ +} +#define KERNEL_m1n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + a_block_pointer ++;\ +} +#define KERNEL_m1n2k1 {\ + xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ + xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + a_block_pointer ++;\ +} +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m4n1 +#define INIT_m2n4 INIT_m4n2 +#define SAVE_m2n4 {\ + yb1 = _mm256_unpacklo_pd(yc1,yc2);\ + yb2 = _mm256_unpackhi_pd(yc1,yc2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ + _mm_storeu_pd(c_pointer,xb1);\ + _mm_storeu_pd(c_pointer+LDC,xb2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ + _mm_storeu_pd(c_pointer+2*LDC,xb1);\ + _mm_storeu_pd(c_pointer+3*LDC,xb2);\ + c_pointer += 2;\ +} +#define SAVE_m1n2 {\ + *c_pointer += _mm_cvtsd_f64(xc1);\ + xa1 = _mm_unpackhi_pd(xc1,xc1);\ + c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ + c_pointer ++;\ +} +#define SAVE_m1n4 {\ + *c_pointer += _mm256_cvtsd_f64(yc1);\ + ya1 = _mm256_unpackhi_pd(yc1,yc1);\ + c_pointer[LDC] += _mm256_cvtsd_f64(ya1);\ + xb1 = _mm256_extractf128_pd(yc1,1);\ + c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ + c_pointer ++;\ +} + +static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +//perform C += A B , edge_n<8 must be satisfied ! 
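+/* KERNEL_EDGE mops up the n%8 trailing columns that the 8-wide ZMM kernels
+   above cannot cover: it steps N in 4/2/1-column tiers and M in 8/4/2/1-row
+   tiers, using the __m512d/__m256d/__m128d/scalar macros defined above. */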
+ if(k==0 || m==0 || edge_n==0) return; + double *a_block_pointer,*b_block_pointer,*b_base_pointer; + double *c_pointer = c; + __m512d zb1,zb2,za1,zc1,zc2,zc3,zc4; + __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; + __m128d xc1,xc2,xa1,xb1,xb2; + double sc1,sa1,sb1; + BLASLONG m_count,n_count,k_count; + b_base_pointer = packed_b; +//now start calculation of the edge part + for(n_count=edge_n;n_count>3;n_count-=4){ + a_block_pointer = packed_a; + for(m_count=m;m_count>7;m_count-=8){ + b_block_pointer = b_base_pointer; + INIT_m8n4 + for(k_count=0;k_count3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n4 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n4 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n4 + for(k_count=0;k_count1;n_count-=2){ + a_block_pointer = packed_a; + for(m_count=m;m_count>7;m_count-=8){ + b_block_pointer = b_base_pointer; + INIT_m8n2 + for(k_count=0;k_count3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n2 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n2 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n2 + for(k_count=0;k_count0){ + a_block_pointer = packed_a; + for(m_count=m;m_count>7;m_count-=8){ + b_block_pointer = b_base_pointer; + INIT_m8n1 + for(k_count=0;k_count3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n1 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n1 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n1 + for(k_count=0;k_count7;m_count-=8){ + for(k_count=k;k_count>0;k_count--){ + tmp = _mm256_loadu_pd(src1);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+0,tmp);src1+=4; + tmp = _mm256_loadu_pd(src2);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+4,tmp);src2+=4; + dst1+=8; + } + src1+=4*k;src2+=4*k; + } + for(;m_count>0;m_count--){ + for(k_count=k;k_count>0;k_count--){ + *dst1 = (*src1) * alpha; src1++; dst1++; + } + } +} +int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){ + if(m==0 || n==0 || k==0) return 0; + BLASLONG ndiv8 = n/8; + double *packed_a = (double *)malloc(m*k*sizeof(double)); + copy_4_to_8(A,packed_a,m,k,alpha); + if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C); + if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8); + free(packed_a);packed_a=NULL; + return 0; +} + From 5da9484d932cd220934207f83ff111df248bba7f Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 02:01:13 +0800 Subject: [PATCH 735/935] Add files via upload --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 0ff59f400..860106991 100644 --- a/param.h +++ b/param.h @@ -1700,7 +1700,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_Q 384 -#define DGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 128 #endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 From 6bd67ddbab5ef752e1cafca4e4b7b66ecbb57452 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 03:20:08 +0800 Subject: [PATCH 736/935] Update dgemm_kernel_8x8_skylakex.c --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 38 +---------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index b4a87cbce..69437e665 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -1,4 +1,5 @@ #include "common.h" +#include #include /* row-major c_block */ /* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/ @@ -289,43 +290,6 @@ INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) -#define COMPUTE_m1n8 {\ - __asm__ __volatile__(\ - INNER_INIT_m1n8\ - INNER_KERNELm1(8)\ - INNER_SAVE_m1n8\ - :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\ - :"zmm4","zmm5","zmm6","zmm7","zmm8","cc","memory","k1");\ - c_pointer += 1;\ -} -#define COMPUTE_m2n8 {\ - __asm__ __volatile__(\ - INNER_INIT_m2n8\ - INNER_KERNELm2(8)\ - INNER_SAVE_m2n8\ - :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\ - :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","cc","memory","k1");\ - c_pointer += 2;\ -} -#define COMPUTE_m4n8 {\ - __asm__ __volatile__(\ - INNER_INIT_m4n8\ - INNER_KERNELm4(8)\ - INNER_SAVE_m4n8\ - :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03),"Yk"(k01)\ - :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","cc","memory");\ - c_pointer += 4;\ -} -#define COMPUTE_m8n8 {\ - __asm__ __volatile__(\ - INNER_INIT_m8n8\ - INNER_KERNELm8(8)\ - INNER_SAVE_m8n8\ - :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03)\ - :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory");\ - c_pointer += 8;\ -} - #define COMPUTE_n8 {\ __asm__ __volatile__(\ "movq %8,%%r14;movq %2,%%r13;"\ From 9b19e9e1b01c6820aaff0c7683ac6b317c9dfaba Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 10:14:51 +0800 Subject: [PATCH 737/935] Update dgemm_kernel_8x8_skylakex.c --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index 69437e665..1db955776 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -429,7 +429,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG double *c_pointer = c; __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; BLASLONG ndiv8_count; - double *b_scratch = (double *)aligned_alloc(64,192*k); + double *b_scratch; + posix_memalign(&b_scratch,64,192*k); double *packed_b_pointer = packed_b; a_block_pointer = packed_a; for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ @@ -637,9 +638,10 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG 
c_pointer ++;\ } #define SAVE_m1n4 {\ - *c_pointer += _mm256_cvtsd_f64(yc1);\ - ya1 = _mm256_unpackhi_pd(yc1,yc1);\ - c_pointer[LDC] += _mm256_cvtsd_f64(ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ xb1 = _mm256_extractf128_pd(yc1,1);\ c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ xb2 = _mm_unpackhi_pd(xb1,xb1);\ From b7315f8401089a91ae382b87be7e2683745828da Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 16 Oct 2019 19:23:36 +0800 Subject: [PATCH 738/935] Add files via upload --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 75 ++++++++--------------- 1 file changed, 26 insertions(+), 49 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index 1db955776..b8b3234d1 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -25,8 +25,8 @@ "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" #define INNER_KERNEL_k1m1n16 \ - "prefetcht0 384(%1); prefetcht0 448(%1);"\ - "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; addq $64,%1;"\ "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" #define INNER_KERNEL_k1m2n16 \ @@ -46,8 +46,8 @@ "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" #define INNER_KERNEL_k1m1n24 \ - "prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\ - "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1); prefetcht0 128(%1,%%r12,2);"\ + "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; vmovupd (%1,%%r12,2),%%zmm7; addq $64,%1;"\ "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" #define INNER_KERNEL_k1m2n24 \ @@ -292,13 +292,13 @@ #define COMPUTE_n8 {\ __asm__ __volatile__(\ - "movq %8,%%r14;movq %2,%%r13;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 42222f;"\ "42221:\n\t"\ INNER_INIT_m8n8\ INNER_KERNELm8(8)\ INNER_SAVE_m8n8\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ "42222:\n\t"\ @@ -306,7 +306,7 @@ INNER_INIT_m4n8\ INNER_KERNELm4(8)\ INNER_SAVE_m4n8\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\ "subq $4,%8;"\ "42223:\n\t"\ @@ -314,7 +314,7 @@ INNER_INIT_m2n8\ INNER_KERNELm2(8)\ INNER_SAVE_m2n8\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "addq $16,%3;"\ "subq $2,%8;"\ "42224:\n\t"\ @@ -322,7 +322,7 @@ INNER_INIT_m1n8\ INNER_KERNELm1(8)\ INNER_SAVE_m1n8\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "addq $8,%3;"\ "42225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ @@ -333,13 +333,13 @@ } #define COMPUTE_n16 {\ __asm__ __volatile__(\ - "movq %8,%%r14;movq %2,%%r13;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 32222f;"\ "32221:\n\t"\ INNER_INIT_m8n16\ INNER_KERNELm8(16)\ 
INNER_SAVE_m8n16\ - "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ "32222:\n\t"\ @@ -347,7 +347,7 @@ INNER_INIT_m4n16\ INNER_KERNELm4(16)\ INNER_SAVE_m4n16\ - "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ "subq $4,%8;"\ "32223:\n\t"\ @@ -355,7 +355,7 @@ INNER_INIT_m2n16\ INNER_KERNELm2(16)\ INNER_SAVE_m2n16\ - "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\ "subq $2,%8;"\ "32224:\n\t"\ @@ -363,28 +363,26 @@ INNER_INIT_m1n16\ INNER_KERNELm1(16)\ INNER_SAVE_m1n16\ - "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\ "32225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ - :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ + "leaq (%1,%%r12,2),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ - "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n24 {\ __asm__ __volatile__(\ - "movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 22222f;"\ "22221:\n\t"\ INNER_INIT_m8n24\ - "prefetcht2 (%%r15); prefetcht2 64(%%r15);"\ INNER_KERNELm8(24)\ - "prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\ INNER_SAVE_m8n24\ - "prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ "22222:\n\t"\ @@ -392,7 +390,7 @@ INNER_INIT_m4n24\ INNER_KERNELm4(24)\ INNER_SAVE_m4n24\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ "subq $4,%8;"\ "22223:\n\t"\ @@ -400,7 +398,7 @@ INNER_INIT_m2n24\ INNER_KERNELm2(24)\ INNER_SAVE_m2n24\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\ "subq $2,%8;"\ "22224:\n\t"\ @@ -408,19 +406,19 @@ INNER_INIT_m1n24\ INNER_KERNELm1(24)\ INNER_SAVE_m1n24\ - "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\ + "movq %%r13,%2; subq %%r12,%1;"\ "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\ "22225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ - :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\ - "+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\ + "leaq (%1,%%r12,2),%1; addq %%r12,%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ 
::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ - "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\ + "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } -static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 //perform C += A B if(k==0 || m==0 || ndiv8==0) return; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); @@ -429,38 +427,17 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG double *c_pointer = c; __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; BLASLONG ndiv8_count; - double *b_scratch; - posix_memalign(&b_scratch,64,192*k); double *packed_b_pointer = packed_b; a_block_pointer = packed_a; for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ - __asm__ __volatile__ ( - "testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;" - "100001:\n\t" - "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;" - "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd %%zmm7,128(%1); addq $192,%1;" - "decq %2; testq %2,%2; jnz 100001b;" - "100002:\n\t" - "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;" - :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7"); COMPUTE_n24 } for(;ndiv8_count>1;ndiv8_count-=2){ - __asm__ __volatile__ ( - "testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;" - "1000001:\n\t" - "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;" - "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;" - "decq %2; testq %2,%2; jnz 1000001b;" - "1000002:\n\t" - "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;" - :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6"); COMPUTE_n16 } if(ndiv8_count>0){ COMPUTE_n8 } - free(b_scratch);b_scratch=NULL; } /* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */ From 6bcb06fcb1d3f2b79fbe67c79db0e0af1d76297c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 18 Oct 2019 10:47:31 +0800 Subject: [PATCH 739/935] make further changes to icopy_8 easier --- kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 103 +++++++++++++--------- 1 file changed, 61 insertions(+), 42 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c index b8b3234d1..49facd751 100644 --- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c @@ -1,8 +1,8 @@ #include "common.h" #include #include +//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. 
/* row-major c_block */ -/* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/ #define INNER_KERNEL_k1m1n8 \ "prefetcht0 384(%1);"\ "prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\ @@ -158,7 +158,7 @@ #define INNER_STORE_m1n8(c1,disp) \ "kxnorw %%k1,%%k1,%%k1;"\ "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\ - "vaddpd %%zmm7,"#c1","#c1";"\ + "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ "kxnorw %%k1,%%k1,%%k1;"\ "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};" @@ -227,26 +227,27 @@ "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" +//%7 for k01(input) only when m=4 #define INNER_STORE_4x8(c1,c2,c3,c4) \ - "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vaddpd %%zmm4,"#c1","#c1";"\ + "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vaddpd %%zmm5,"#c2","#c2";"\ + "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vaddpd %%zmm6,"#c3","#c3";"\ + "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vaddpd %%zmm7,"#c4","#c4";"\ + "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ "leaq (%3,%4,4),%3;" #define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vaddpd (%3),"#c1","#c1"; vmovupd "#c1",(%3); vaddpd (%3,%4,1),"#c2","#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vaddpd (%3),"#c3","#c3"; vmovupd "#c3",(%3); vaddpd (%3,%4,1),"#c4","#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vaddpd (%3),"#c5","#c5"; vmovupd "#c5",(%3); vaddpd (%3,%4,1),"#c6","#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vaddpd (%3),"#c7","#c7"; vmovupd "#c7",(%3); vaddpd (%3,%4,1),"#c8","#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" + "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" #define INNER_SAVE_m4n8 \ INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ @@ -292,6 +293,7 @@ #define COMPUTE_n8 {\ __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 42222f;"\ "42221:\n\t"\ @@ -327,12 +329,13 @@ "42225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ - 
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ - ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n16 {\ __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 32222f;"\ "32221:\n\t"\ @@ -369,13 +372,14 @@ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ "leaq (%1,%%r12,2),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ - ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n24 {\ __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\ "cmpq $8,%8; jb 22222f;"\ "22221:\n\t"\ @@ -412,13 +416,13 @@ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ "leaq (%1,%%r12,2),%1; addq %%r12,%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\ - ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } -static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 //perform C += A B if(k==0 || m==0 || ndiv8==0) return; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); @@ -426,7 +430,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac double *a_block_pointer; double *c_pointer = c; __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; - BLASLONG ndiv8_count; + BLASLONG m_count,ndiv8_count,k_count; double *packed_b_pointer = packed_b; a_block_pointer = packed_a; for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ @@ -474,24 +478,27 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac #define INIT_m8n2 zc2=INIT_m8n1 #define INIT_m8n4 zc4=zc3=INIT_m8n2 
#define SAVE_m8n1 {\ - za1 = _mm512_loadu_pd(c_pointer);\ - zc1 = _mm512_add_pd(zc1,za1);\ + __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\ + zb1 = _mm512_loadu_pd(c_pointer);\ + zc1 = _mm512_fmadd_pd(zc1,za1,zb1);\ _mm512_storeu_pd(c_pointer,zc1);\ c_pointer += 8;\ } #define SAVE_m8n2 {\ + __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\ zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ - zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\ + zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\ _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\ c_pointer += 8;\ } #define SAVE_m8n4 {\ + __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\ zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ - zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\ + zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\ _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\ c_pointer += LDC*2;\ zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\ - zc3 = _mm512_add_pd(zc3,zb1); zc4 = _mm512_add_pd(zc4,zb2);\ + zc3 = _mm512_fmadd_pd(zc3,za1,zb1); zc4 = _mm512_fmadd_pd(zc4,za1,zb2);\ _mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\ c_pointer += 8-LDC*2;\ } @@ -518,24 +525,27 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac #define INIT_m4n2 yc2=INIT_m4n1 #define INIT_m4n4 yc4=yc3=INIT_m4n2 #define SAVE_m4n1 {\ + yb1 = _mm256_broadcast_sd(alpha);\ ya1 = _mm256_loadu_pd(c_pointer);\ - yc1 = _mm256_add_pd(yc1,ya1);\ + yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ _mm256_storeu_pd(c_pointer,yc1);\ c_pointer += 4;\ } #define SAVE_m4n2 {\ + ya1 = _mm256_broadcast_sd(alpha);\ yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ c_pointer += 4;\ } #define SAVE_m4n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ c_pointer += LDC*2;\ yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc3 = _mm256_add_pd(yc3,yb1); yc4 = _mm256_add_pd(yc4,yb2);\ + yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ c_pointer += 4-LDC*2;\ } @@ -553,14 +563,16 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac #define INIT_m2n1 xc1=_mm_setzero_pd(); #define INIT_m2n2 xc2=INIT_m2n1 #define SAVE_m2n1 {\ + xb1 = _mm_loaddup_pd(alpha);\ xa1 = _mm_loadu_pd(c_pointer);\ - xc1 = _mm_add_pd(xc1,xa1);\ + xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ _mm_storeu_pd(c_pointer,xc1);\ c_pointer += 2;\ } #define SAVE_m2n2 {\ + xa1 = _mm_loaddup_pd(alpha);\ xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ - xc1 = _mm_add_pd(xc1,xb1); xc2 = _mm_add_pd(xc2,xb2);\ + xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ c_pointer += 2;\ } @@ -571,7 
+583,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac } #define INIT_m1n1 sc1=0.0; #define SAVE_m1n1 {\ - *c_pointer += sc1;\ + *c_pointer += sc1 * (*alpha);\ c_pointer++;\ } @@ -596,6 +608,9 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac #define INIT_m1n4 INIT_m4n1 #define INIT_m2n4 INIT_m4n2 #define SAVE_m2n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + yc2 = _mm256_mul_pd(yc2,ya1);\ yb1 = _mm256_unpacklo_pd(yc1,yc2);\ yb2 = _mm256_unpackhi_pd(yc1,yc2);\ xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ @@ -609,12 +624,16 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac c_pointer += 2;\ } #define SAVE_m1n2 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xc1 = _mm_mul_pd(xc1,xb1);\ *c_pointer += _mm_cvtsd_f64(xc1);\ xa1 = _mm_unpackhi_pd(xc1,xc1);\ c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ c_pointer ++;\ } #define SAVE_m1n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ xb1 = _mm256_extractf128_pd(yc1,0);\ *c_pointer += _mm_cvtsd_f64(xb1);\ xb2 = _mm_unpackhi_pd(xb1,xb1);\ @@ -626,7 +645,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac c_pointer ++;\ } -static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8 +static void __attribute__ ((noinline)) KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 //perform C += A B , edge_n<8 must be satisfied ! if(k==0 || m==0 || edge_n==0) return; double *a_block_pointer,*b_block_pointer,*b_base_pointer; @@ -724,30 +743,30 @@ static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG } } } -static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k,double alpha){ - BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp,alp; - src1 = src; dst1 = dst; src2 = src1 + 4 * k; alp = _mm256_set1_pd(alpha); +static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){ + BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp; + src1 = src; dst1 = dst; src2 = src1 + 4 * k; for(m_count=m;m_count>7;m_count-=8){ for(k_count=k;k_count>0;k_count--){ - tmp = _mm256_loadu_pd(src1);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+0,tmp);src1+=4; - tmp = _mm256_loadu_pd(src2);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+4,tmp);src2+=4; + tmp = _mm256_loadu_pd(src1);_mm256_storeu_pd(dst1+0,tmp);src1+=4; + tmp = _mm256_loadu_pd(src2);_mm256_storeu_pd(dst1+4,tmp);src2+=4; dst1+=8; } src1+=4*k;src2+=4*k; } for(;m_count>0;m_count--){ for(k_count=k;k_count>0;k_count--){ - *dst1 = (*src1) * alpha; src1++; dst1++; + *dst1 = (*src1); src1++; dst1++; } } } int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){ - if(m==0 || n==0 || k==0) return 0; - BLASLONG ndiv8 = n/8; + if(m==0 || n==0 || k==0 || alpha == 0.0) return 0; + BLASLONG ndiv8 = n/8;double ALPHA = alpha; double *packed_a = (double *)malloc(m*k*sizeof(double)); - copy_4_to_8(A,packed_a,m,k,alpha); - if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C); - if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8); + copy_4_to_8(A,packed_a,m,k); + if(ndiv8>0) 
KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
+ if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
 free(packed_a);packed_a=NULL;
 return 0;
 }

From 17cdd9f9e17728e4d8a044aa23028aa40b5e9946 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 18 Oct 2019 14:58:07 +0800
Subject: [PATCH 740/935] some correction

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index 49facd751..bfd63bbc7 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -330,7 +330,7 @@
 "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
 "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
 :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
- ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\
+ ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
 }
 #define COMPUTE_n16 {\
@@ -645,8 +645,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
 c_pointer ++;\
 }
-static void __attribute__ ((noinline)) KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
-//perform C += A B , edge_n<8 must be satisfied !
+static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
+//perform C += A B , edge_n<8 must be satisfied.
 if(k==0 || m==0 || edge_n==0) return;
 double *a_block_pointer,*b_block_pointer,*b_base_pointer;
 double *c_pointer = c;
@@ -763,11 +763,16 @@ static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
 int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
 if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
 BLASLONG ndiv8 = n/8;double ALPHA = alpha;
+#ifdef ICOPY_4
 double *packed_a = (double *)malloc(m*k*sizeof(double));
 copy_4_to_8(A,packed_a,m,k);
+#else //ICOPY_8
+ double *packed_a = A;
+#endif
 if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
 if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
+#ifdef ICOPY_4
 free(packed_a);packed_a=NULL;
+#endif
 return 0;
 }
-

From 0d669e04bb2716b6a7767f0f449a2b09caf2d456 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 18 Oct 2019 15:00:17 +0800
Subject: [PATCH 741/935] Update dgemm_kernel_8x8_skylakex.c

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index bfd63bbc7..1139090e2 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -1,6 +1,8 @@
 #include "common.h"
 #include <stdint.h>
 #include <immintrin.h>
+
+#define ICOPY_4
 //register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
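Note: a sketch, not part of the patch, of the two A-panel layouts the new ICOPY_4 switch chooses between; the accessor names are hypothetical. With an 8-wide inner copy the packed panel holds 8 consecutive rows of A per k step; with the 4-wide copy it holds 4, so an 8-row micro-kernel has to read two 4-row sub-panels that sit 4*k doubles apart, which is exactly what copy_4_to_8 flattens away:

    /* element A~(i,p) of a panel packed 4 rows at a time */
    static inline double packed_a_get4(const double *pa, long i, long p, long k) {
        long tile = i / 4, row = i % 4;          /* which 4-row sub-panel    */
        return pa[tile * 4 * k + p * 4 + row];   /* sub-panels are 4*k apart */
    }
    /* the same element in a panel packed 8 rows at a time */
    static inline double packed_a_get8(const double *pa, long i, long p, long k) {
        long tile = i / 8, row = i % 8;
        return pa[tile * 8 * k + p * 8 + row];
    }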
 /* row-major c_block */
 #define INNER_KERNEL_k1m1n8 \
@@ -743,6 +745,7 @@ static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
 }
 }
 }
+#ifdef ICOPY_4
 static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
 BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp;
 src1 = src; dst1 = dst; src2 = src1 + 4 * k;
@@ -760,6 +763,7 @@ static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
 }
 }
 }
+#endif
 int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
 if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
 BLASLONG ndiv8 = n/8;double ALPHA = alpha;

From 6ff013bae0965d791841aead9c9dcb0e27e1b7be Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Sat, 19 Oct 2019 03:54:44 +0800
Subject: [PATCH 742/935] native support for icopy_4

90% MKL 1-thread performance.
---
 kernel/x86_64/KERNEL.SKYLAKEX | 2 +-
 kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 666 ++++++++++++++++++++
 2 files changed, 667 insertions(+), 1 deletion(-)
 create mode 100644 kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index d73a47925..82a455b44 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -7,7 +7,7 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
 SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
 SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
 
-DGEMMKERNEL = dgemm_kernel_8x8_skylakex.c
+DGEMMKERNEL = dgemm_kernel_4x8_skylakex_2.c
 DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
 DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c
 
diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
new file mode 100644
index 000000000..a958a1a6f
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
@@ -0,0 +1,666 @@
+#include "common.h"
+#include <stdint.h>
+#include <immintrin.h>
+
+//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
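Note: a minimal intrinsics sketch, not part of the patch, of one k-step of the m1n8 inner kernel defined below; the helper name is hypothetical, and the real code keeps the accumulator in zmm8 across the whole k loop:

    /* acc += A[0] * B[0..7], mirroring the asm sequence
       "vmovupd (%1),%%zmm5" / "vbroadcastsd (%0),%%zmm4" /
       "vfmadd231pd %%zmm5,%%zmm4,%%zmm8" in INNER_KERNEL_k1m1n8 */
    static inline __m512d k1m1n8_step(const double *a, const double *b, __m512d acc) {
        __m512d brow = _mm512_loadu_pd(b);        /* one 8-double row of packed B */
        __m512d aval = _mm512_set1_pd(a[0]);      /* broadcast one element of A   */
        return _mm512_fmadd_pd(brow, aval, acc);  /* fused multiply-add           */
    }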
+/* row-major c_block */ +#define INNER_KERNEL_k1m1n8 \ + "prefetcht0 384(%1);"\ + "vmovupd (%1),%%zmm5; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" + +#define INNER_KERNEL_k1m2n8 \ + INNER_KERNEL_k1m1n8\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m4n8 \ + INNER_KERNEL_k1m2n8\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m8n8 \ + INNER_KERNEL_k1m4n8\ + "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\ + "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\ + "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\ + "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" + +#define INNER_KERNEL_k1m1n16 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m2n16 \ + INNER_KERNEL_k1m1n16\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m2n16\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m4n16\ + "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\ + "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\ + "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\ + "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" + +#define INNER_KERNEL_k1m1n24 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" + +#define INNER_KERNEL_k1m2n24 \ + INNER_KERNEL_k1m1n24\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" + +#define INNER_KERNEL_k1m4n24 \ + INNER_KERNEL_k1m2n24\ + "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\ + "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;" + +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m4n24\ + "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\ + "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\ + "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\ + "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;" + +#define INNER_KERNELm1(nn) \ + "cmpq $1,%2;jb "#nn"3f;"\ + 
#nn"4:\n\t"\ + INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ + #nn"3:\n\t" + +#define INNER_KERNELm2(nn) \ + "cmpq $1,%2;jb "#nn"0f;"\ + #nn"1:\n\t"\ + INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ + #nn"0:\n\t" + +#define INNER_KERNELm4(nn) \ + "cmpq $1,%2;jb "#nn"00f;"\ + #nn"01:\n\t"\ + INNER_KERNEL_k1m4n##nn "addq $32,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ + #nn"00:\n\t" + +#define INNER_KERNELm8(nn) \ + "cmpq $8,%2;jb "#nn"001f;"\ + #nn"008:\n\t"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\ + #nn"001:\n\t"\ + "cmpq $1,%2;jb "#nn"000f;"\ + INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + "decq %2;jmp "#nn"001b;"\ + ""#nn"000:\n\t" + +#define INNER_INIT_m1n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8;" + +#define INNER_INIT_m2n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" + +#define INNER_INIT_m4n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" + +#define INNER_INIT_m8n8 \ + INNER_INIT_m4n8\ + "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" + +#define INNER_INIT_m1n16 INNER_INIT_m2n8 + +#define INNER_INIT_m2n16 INNER_INIT_m4n8 + +#define INNER_INIT_m4n16 INNER_INIT_m8n8 + +#define INNER_INIT_m8n16 \ + INNER_INIT_m8n8\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ + "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" + +#define INNER_INIT_m1n24 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" + +#define INNER_INIT_m2n24 \ + INNER_INIT_m1n24\ + "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" + +#define INNER_INIT_m4n24 \ + INNER_INIT_m4n16\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" + +#define INNER_INIT_m8n24 \ + INNER_INIT_m8n16\ + "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ + "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" + +#define INNER_SETINDEX \ + "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ + "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" + +#define INNER_STORE_m1n8(c1,disp) \ + "kxnorw %%k1,%%k1,%%k1;"\ + "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ + "kxnorw %%k1,%%k1,%%k1;"\ + "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) 
%{%%k1%};" + +#define INNER_SAVE_m1n8 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0) + +#define INNER_SAVE_m1n16 \ + INNER_SAVE_m1n8\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0) + +#define INNER_SAVE_m1n24 \ + INNER_SAVE_m1n16\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm10,0) + +#define INNER_SAVE_m2n8 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm9,8) + +#define INNER_SAVE_m2n16 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm10,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm11,8) +#define INNER_SAVE_m2n24 \ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm11,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm12,8)\ + "leaq (%3,%4,8),%3;"\ + INNER_STORE_m1n8(%%zmm10,0)\ + INNER_STORE_m1n8(%%zmm13,8) +#define INNER_PREF_8x8 \ + "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\ + "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\ + "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\ + "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\ + "subq %4,%3; subq %4,%3; subq %4,%3;" +#define INNER_TRANS_4x8(c1,c2,c3,c4) \ + "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ + "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ + "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\ + "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ + "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ + "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" +#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + INNER_TRANS_4x8(c1,c2,c3,c4)\ + INNER_TRANS_4x8(c5,c6,c7,c8)\ + "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\ + "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\ + "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\ + "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ + "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ + "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" +//%7 for k01(input) only when m=4 +#define INNER_STORE_4x8(c1,c2,c3,c4) \ + "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ + "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ + "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ + "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ + "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ + "leaq (%3,%4,4),%3;" +#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vfmadd213pd 
(%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ + "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" +#define INNER_SAVE_m4n8 \ + INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ + INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) +#define INNER_SAVE_m4n16 \ + INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ + INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ + INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ + INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) +#define INNER_SAVE_m4n24 \ + INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ + INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ + INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ + INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ + INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ + INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) +#define INNER_SAVE_m8n8 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) +#define INNER_SAVE_m8n16 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ + INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ + INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) +#define INNER_SAVE_m8n24 \ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ + INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ + INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ + INNER_PREF_8x8\ + INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ + INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) + +#define COMPUTE_n8 {\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 42222f;"\ + "42221:\n\t"\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ + "42222:\n\t"\ + "cmpq $4,%8; jb 42223f;"\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "42223:\n\t"\ + "cmpq $2,%8; jb 42224f;"\ + INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "42224:\n\t"\ + "cmpq $1,%8; jb 42225f;"\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "42225:\n\t"\ + 
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n16 {\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 32222f;"\ + "32221:\n\t"\ + INNER_INIT_m8n16\ + INNER_KERNELm8(16)\ + INNER_SAVE_m8n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ + "32222:\n\t"\ + "cmpq $4,%8; jb 32223f;"\ + INNER_INIT_m4n16\ + INNER_KERNELm4(16)\ + INNER_SAVE_m4n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "32223:\n\t"\ + "cmpq $2,%8; jb 32224f;"\ + INNER_INIT_m2n16\ + INNER_KERNELm2(16)\ + INNER_SAVE_m2n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\ + "subq $2,%8;"\ + "32224:\n\t"\ + "cmpq $1,%8; jb 32225f;"\ + INNER_INIT_m1n16\ + INNER_KERNELm1(16)\ + INNER_SAVE_m1n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\ + "32225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n24 {\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 22222f;"\ + "22221:\n\t"\ + INNER_INIT_m8n24\ + INNER_KERNELm8(24)\ + INNER_SAVE_m8n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ + "22222:\n\t"\ + "cmpq $4,%8; jb 22223f;"\ + INNER_INIT_m4n24\ + INNER_KERNELm4(24)\ + INNER_SAVE_m4n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "subq $4,%8;"\ + "22223:\n\t"\ + "cmpq $2,%8; jb 22224f;"\ + INNER_INIT_m2n24\ + INNER_KERNELm2(24)\ + INNER_SAVE_m2n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\ + "subq $2,%8;"\ + "22224:\n\t"\ + "cmpq $1,%8; jb 22225f;"\ + INNER_INIT_m1n24\ + INNER_KERNELm1(24)\ + INNER_SAVE_m1n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\ + "22225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ + 
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 +//perform C += A B + if(k==0 || m==0 || ndiv8==0) return; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); + int64_t K = (int64_t)k; int64_t M = (int64_t)m; + double *a_block_pointer; + double *c_pointer = c; + __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; + BLASLONG ndiv8_count; + double *packed_b_pointer = packed_b; + a_block_pointer = packed_a; + for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ + COMPUTE_n24 + } + for(;ndiv8_count>1;ndiv8_count-=2){ + COMPUTE_n16 + } + if(ndiv8_count>0){ + COMPUTE_n8 + } +} + +/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ +/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */ +/* double accumulator: sc1; temporary variables: sa1,sb1 */ +/* column-major c_block */ +#define KERNEL_m4n4k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ + b_block_pointer+=4;\ +} +#define KERNEL_m4n2k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + b_block_pointer+=2;\ +} +#define KERNEL_m4n1k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + b_block_pointer++;\ +} +#define INIT_m4n1 yc1=_mm256_setzero_pd(); +#define INIT_m4n2 yc2=INIT_m4n1 +#define INIT_m4n4 yc4=yc3=INIT_m4n2 +#define SAVE_m4n1 {\ + yb1 = _mm256_broadcast_sd(alpha);\ + ya1 = _mm256_loadu_pd(c_pointer);\ + yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ + _mm256_storeu_pd(c_pointer,yc1);\ + c_pointer += 4;\ +} +#define SAVE_m4n2 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += 4;\ +} +#define SAVE_m4n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += LDC*2;\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ + c_pointer += 4-LDC*2;\ +} +#define KERNEL_m2n2k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ + b_block_pointer += 2;\ +} +#define KERNEL_m2n1k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = 
_mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + b_block_pointer ++;\ +} +#define INIT_m2n1 xc1=_mm_setzero_pd(); +#define INIT_m2n2 xc2=INIT_m2n1 +#define SAVE_m2n1 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xa1 = _mm_loadu_pd(c_pointer);\ + xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ + _mm_storeu_pd(c_pointer,xc1);\ + c_pointer += 2;\ +} +#define SAVE_m2n2 {\ + xa1 = _mm_loaddup_pd(alpha);\ + xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ + xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ + _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ + c_pointer += 2;\ +} +#define KERNEL_m1n1k1 {\ + sa1 = *a_block_pointer; a_block_pointer++;\ + sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ + b_block_pointer ++;\ +} +#define INIT_m1n1 sc1=0.0; +#define SAVE_m1n1 {\ + *c_pointer += sc1 * (*alpha);\ + c_pointer++;\ +} +/* row-major c_block */ +#define KERNEL_m2n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ + a_block_pointer += 2;\ +} +#define KERNEL_m1n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + a_block_pointer ++;\ +} +#define KERNEL_m1n2k1 {\ + xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ + xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + a_block_pointer ++;\ +} +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m4n1 +#define INIT_m2n4 INIT_m4n2 +#define SAVE_m2n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + yc2 = _mm256_mul_pd(yc2,ya1);\ + yb1 = _mm256_unpacklo_pd(yc1,yc2);\ + yb2 = _mm256_unpackhi_pd(yc1,yc2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ + _mm_storeu_pd(c_pointer,xb1);\ + _mm_storeu_pd(c_pointer+LDC,xb2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ + _mm_storeu_pd(c_pointer+2*LDC,xb1);\ + _mm_storeu_pd(c_pointer+3*LDC,xb2);\ + c_pointer += 2;\ +} +#define SAVE_m1n2 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xc1 = _mm_mul_pd(xc1,xb1);\ + *c_pointer += _mm_cvtsd_f64(xc1);\ + xa1 = _mm_unpackhi_pd(xc1,xc1);\ + c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ + c_pointer ++;\ +} +#define SAVE_m1n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ + xb1 = _mm256_extractf128_pd(yc1,1);\ + c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ + c_pointer ++;\ +} +static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 +//perform C += A B , edge_n<8 must be satisfied. 
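Note: a scalar reference, for illustration only and not part of the patch, of what the tiled loops in the function body below compute; pa and pb are treated as plain dense arrays here, while the real code walks the packed panels through the INIT/KERNEL/SAVE macros above:

    static void kernel_edge_ref(const double *pa, const double *pb, BLASLONG m,
                                BLASLONG edge_n, BLASLONG k, BLASLONG LDC,
                                double *c, double alpha) {
        BLASLONG i, j, p;
        for (j = 0; j < edge_n; j++)          /* n is peeled in 4/2/1 steps below */
            for (i = 0; i < m; i++) {         /* m is peeled in 4/2/1 steps below */
                double s = 0.0;
                for (p = 0; p < k; p++)       /* the KERNEL_m?n?k1 macros         */
                    s += pa[i * k + p] * pb[p * edge_n + j];
                c[i + j * LDC] += alpha * s;  /* the SAVE_m?n? macros             */
            }
    }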
+ if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return;
+ double *a_block_pointer,*b_block_pointer,*b_base_pointer;
+ double *c_pointer = c;
+ __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2;
+ __m128d xc1,xc2,xa1,xb1,xb2;
+ double sc1,sa1,sb1;
+ BLASLONG m_count,n_count,k_count;
+ b_base_pointer = packed_b;
+//now start calculation of the edge part
+ for(n_count=edge_n;n_count>3;n_count-=4){
+ a_block_pointer = packed_a;
+ for(m_count=m;m_count>3;m_count-=4){
+ b_block_pointer = b_base_pointer;
+ INIT_m4n4
+ for(k_count=0;k_count<k;k_count++) KERNEL_m4n4k1
+ SAVE_m4n4
+ }
+ for(;m_count>1;m_count-=2){
+ b_block_pointer = b_base_pointer;
+ INIT_m2n4
+ for(k_count=0;k_count<k;k_count++) KERNEL_m2n4k1
+ SAVE_m2n4
+ }
+ if(m_count>0){
+ b_block_pointer = b_base_pointer;
+ INIT_m1n4
+ for(k_count=0;k_count<k;k_count++) KERNEL_m1n4k1
+ SAVE_m1n4
+ }
+ b_base_pointer += 4*k;
+ c_pointer += 4*LDC-m;
+ }
+ for(;n_count>1;n_count-=2){
+ a_block_pointer = packed_a;
+ for(m_count=m;m_count>3;m_count-=4){
+ b_block_pointer = b_base_pointer;
+ INIT_m4n2
+ for(k_count=0;k_count<k;k_count++) KERNEL_m4n2k1
+ SAVE_m4n2
+ }
+ for(;m_count>1;m_count-=2){
+ b_block_pointer = b_base_pointer;
+ INIT_m2n2
+ for(k_count=0;k_count<k;k_count++) KERNEL_m2n2k1
+ SAVE_m2n2
+ }
+ if(m_count>0){
+ b_block_pointer = b_base_pointer;
+ INIT_m1n2
+ for(k_count=0;k_count<k;k_count++) KERNEL_m1n2k1
+ SAVE_m1n2
+ }
+ b_base_pointer += 2*k;
+ c_pointer += 2*LDC-m;
+ }
+ if(n_count>0){
+ a_block_pointer = packed_a;
+ for(m_count=m;m_count>3;m_count-=4){
+ b_block_pointer = b_base_pointer;
+ INIT_m4n1
+ for(k_count=0;k_count<k;k_count++) KERNEL_m4n1k1
+ SAVE_m4n1
+ }
+ for(;m_count>1;m_count-=2){
+ b_block_pointer = b_base_pointer;
+ INIT_m2n1
+ for(k_count=0;k_count<k;k_count++) KERNEL_m2n1k1
+ SAVE_m2n1
+ }
+ if(m_count>0){
+ b_block_pointer = b_base_pointer;
+ INIT_m1n1
+ for(k_count=0;k_count<k;k_count++) KERNEL_m1n1k1
+ SAVE_m1n1
+ }
+ }
+}
+int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
+ if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
+ BLASLONG ndiv8 = n/8;double ALPHA = alpha;
+ double *packed_a = A;
+ if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
+ if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
+ return 0;
+}

From e9437eebd26ce9d6b4a51b5d87fda5dedf329527 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 24 Oct 2019 18:45:27 +0200
Subject: [PATCH 743/935] Restore Goldmont ID and improve QEMU support

#2283 had inadvertently removed Goldmont+, and cpuid was reporting a mix
of Core2 and Pentium2 for some QEMU configurations
---
 cpuid_x86.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 92c8e1b67..9e1c8e752 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1197,7 +1197,11 @@ int get_cpuname(void){
 case 3:
 case 5:
 case 6:
+#if defined(__x86_64__) || defined(__amd64__)
+ return CPUTYPE_CORE2;
+#else
 return CPUTYPE_PENTIUM2;
+#endif
 case 7:
 case 8:
 case 10:
@@ -1379,6 +1383,8 @@ int get_cpuname(void){
 break;
 case 7: // family 6 exmodel 7
 switch (model) {
+ case 10: // Goldmont Plus
+ return CPUTYPE_NEHALEM;
 case 14: // Ice Lake
 if(support_avx512())
 return CPUTYPE_SKYLAKEX;

From 46a8c2519a85553e63f6bcfd7970731edbf9f013 Mon Sep 17 00:00:00 2001
From: luzpaz
Date: Thu, 24 Oct 2019 12:56:53 -0400
Subject: [PATCH 744/935] Remove prototype of unused, unimplemented function
 (#2274)

* Fix source typo

Found via `codespell -q 3 -L amin,als,ba,dum,mone,nd,nto,orign -S Changelog.txt,./lapack*`

* Remove beta-thread function per request
---
 common_thread.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/common_thread.h b/common_thread.h
index bd964445e..6ec40e096 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
 int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
 
-int beta_thread(int mode, BLASLONG m, BLASLONG n,
- double alpha_r, double alpha_i,
- void *c, BLASLONG ldc, int (*fuction)());
-
 int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
 void *offsetA, BLASLONG lda,
 void *offsetB, BLASLONG jb,

From b687fba5bcb9192499d84f2d0d250230cad09407 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 24 Oct 2019 21:18:17 +0200
Subject: [PATCH 745/935] Disable
 direct clock register access on IOS and Android

as I find conflicting information on accessibility from non-privileged
processes
---
 common_arm64.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/common_arm64.h b/common_arm64.h
index 13718af5a..376f81e60 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -78,17 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
 
 #define BLAS_LOCK_DEFINED
 
+#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
 static __inline BLASULONG rpcc(void){
 BLASULONG ret = 0;
 
- __asm__ __volatile__ ("mrs %0,cntvct_el0":"=r"(ret));
+ __asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
 
 return ret;
 }
 #define RPCC_DEFINED
 
 #define RPCC64BIT
-
+#endif
 
 static inline int blas_quickdivide(blasint x, blasint y){
 return x / y;
 }

From fab49e49e5edbcb2b39560cd419b38faaf9694c1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 24 Oct 2019 21:26:20 +0200
Subject: [PATCH 746/935] Move most lapack 3.7/3.8 additions to the
 embedded_underscores list to allow linktest to pass with a compiler that
 adds a second underscore to such names

---
 exports/gensymbol | 197 +++++++++++++---------------------------------
 1 file changed, 54 insertions(+), 143 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index 21a1b703d..37ba0b191 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -618,19 +618,6 @@
 # functions added for lapack-3.7.0
 slarfy,
- slasyf_rk,
- ssyconvf_rook,
- ssytf2_rk,
- ssytrf_rk,
- ssytrs_3,
- ssytri_3,
- ssytri_3x,
- ssycon_3,
- ssysv_rk,
- slasyf_aa,
- ssysv_aa,
- ssytrf_aa,
- ssytrs_aa,
 strevc3,
 sgelqt,
 sgelqt3,
@@ -647,33 +634,8 @@
 stplqt,
 stplqt2,
 stpmlqt,
- ssytrd_2stage,
- ssytrd_sy2sb,
- ssytrd_sb2st,
- ssb2st_kernels,
- ssyevd_2stage,
- ssyev_2stage,
- ssyevx_2stage,
- ssyevr_2stage,
- ssbev_2stage,
- ssbevx_2stage,
- ssbevd_2stage,
- ssygv_2stage,
 dlarfy,
- dlasyf_rk,
 dsyconvf,
- dsyconvf_rook,
- dsytf2_rk,
- dsytrf_rk,
- dsytrs_3,
- dsytri_3,
- dsytri_3x,
- dsycon_3,
- dsysv_rk,
- dlasyf_aa,
- dsysv_aa,
- dsytrf_aa,
- dsytrs_aa,
 dtrevc3,
 dgelqt,
 dgelqt3,
@@ -690,45 +652,8 @@
 dtplqt,
 dtplqt2,
 dtpmlqt,
- dsytrd_2stage,
- dsytrd_sy2sb,
- dsytrd_sb2st,
- dsb2st_kernels,
- dsyevd_2stage,
- dsyev_2stage,
- dsyevx_2stage,
- dsyevr_2stage,
- dsbev_2stage,
- dsbevx_2stage,
- dsbevd_2stage,
- dsygv_2stage,
- chetf2_rk,
- chetrf_rk,
- chetri_3,
- chetri_3x,
- chetrs_3,
- checon_3,
- chesv_rk,
- chesv_aa,
- chetrf_aa,
- chetrs_aa,
- clahef_aa,
- clahef_rk,
 clarfy,
- clasyf_rk,
- clasyf_aa,
 csyconvf,
- csyconvf_rook,
- csytf2_rk,
- csytrf_rk,
- csytrf_aa,
- csytrs_3,
- csytrs_aa,
- csytri_3,
- csytri_3x,
- csycon_3,
- csysv_rk,
- csysv_aa,
 ctrevc3,
 cgelqt,
 cgelqt3,
@@ -745,45 +670,8 @@
 ctplqt,
 ctplqt2,
 ctpmlqt,
- chetrd_2stage,
- chetrd_he2hb,
- chetrd_hb2st,
- chb2st_kernels,
- cheevd_2stage,
- cheev_2stage,
- cheevx_2stage,
- cheevr_2stage,
- chbev_2stage,
- chbevx_2stage,
- chbevd_2stage,
- chegv_2stage,
- zhetf2_rk,
- zhetrf_rk,
- zhetri_3,
- zhetri_3x,
- zhetrs_3,
- zhecon_3,
- zhesv_rk,
- zhesv_aa,
- zhetrf_aa,
- zhetrs_aa,
- zlahef_aa,
- zlahef_rk,
 zlarfy,
- zlasyf_rk,
- zlasyf_aa,
 zsyconvf,
- zsyconvf_rook,
- zsytrs_aa,
- zsytf2_rk,
- zsytrf_rk,
- zsytrf_aa,
- zsytrs_3,
- zsytri_3,
- zsytri_3x,
- zsycon_3,
- zsysv_rk,
- zsysv_aa,
 ztrevc3,
 ztplqt,
 ztplqt2,
@@ -800,43 +688,13 @@
 zlaswlq,
 zlamswlq,
 zgemlq,
- zhetrd_2stage,
- zhetrd_he2hb,
- zhetrd_hb2st,
- zhb2st_kernels,
- zheevd_2stage,
- zheev_2stage,
- zheevx_2stage,
- zheevr_2stage,
- zhbev_2stage,
- zhbevx_2stage,
- zhbevd_2stage,
- zhegv_2stage,
sladiv1, dladiv1, iparam2stage, # functions added for lapack-3.8.0 - ilaenv2stage, - ssysv_aa_2stage, - ssytrf_aa_2stage, - ssytrs_aa_2stage, - chesv_aa_2stage, - chetrf_aa_2stage, - chetrs_aa_2stage, - csysv_aa_2stage, - csytrf_aa_2stage, - csytrs_aa_2stage, - dsysv_aa_2stage, - dsytrf_aa_2stage, - dsytrs_aa_2stage, - zhesv_aa_2stage, - zhetrf_aa_2stage, - zhetrs_aa_2stage, - zsysv_aa_2stage, - zsytrf_aa_2stage, - zsytrs_aa_2stage + ilaenv2stage ); @lapack_extendedprecision_objs = ( @@ -3509,6 +3367,59 @@ zlahef_rook, zlasyf_rook, zsytf2_rook, zsytrf_rook, zsytrs_rook, zsytri_rook, zsycon_rook, zsysv_rook, +# 3.7.0 + slasyf_rk, ssyconvf_rook, ssytf2_rk, + ssytrf_rk, ssytrs_3, ssytri_3, + ssytri_3x, ssycon_3, ssysv_rk, + slasyf_aa, ssysv_aa, ssytrf_aa, + ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb, + ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage, + ssyev_2stage, ssyevx_2stage, ssyevr_2stage, + ssbev_2stage, ssbevx_2stage, ssbevd_2stage, + ssygv_2stage, dlasyf_rk, dsyconvf_rook, + dsytf2_rk, dsytrf_rk, dsytrs_3, + dsytri_3, dsytri_3x, dsycon_3, + dsysv_rk, dlasyf_aa, dsysv_aa, + dsytrf_aa, dsytrs_aa, dsytrd_2stage, + dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels, + dsyevd_2stage, dsyev_2stage, dsyevx_2stage, + dsyevr_2stage, dsbev_2stage, dsbevx_2stage, + dsbevd_2stage, dsygv_2stage, chetf2_rk, + chetrf_rk, chetri_3, chetri_3x, + chetrs_3, checon_3, chesv_rk, + chesv_aa, chetrf_aa, chetrs_aa, + clahef_aa, clahef_rk, clasyf_rk, + clasyf_aa, csytf2_rk, csytrf_rk, + csytrf_aa, csytrs_3, csytrs_aa, + csytri_3, csytri_3x, csycon_3, + csysv_rk, csysv_aa, csyconvf_rook, + chetrd_2stage, chetrd_he2hb, chetrd_hb2st, + chb2st_kernels, cheevd_2stage, cheev_2stage, + cheevx_2stage, cheevr_2stage, chbev_2stage, + chbevx_2stage, chbevd_2stage, chegv_2stage, + zhetf2_rk, zhetrf_rk, zhetri_3, + zhetri_3x, zhetrs_3, zhecon_3, + zhesv_rk, zhesv_aa, zhetrf_aa, + zhetrs_aa, zlahef_aa, zlahef_rk, + zlasyf_rk, zlasyf_aa, zsyconvf_rook, + zsytrs_aa, zsytf2_rk, zsytrf_rk, + zsytrf_aa, zsytrs_3, zsytri_3, + zsytri_3x, zsycon_3, zsysv_rk, + zsysv_aa, zhetrd_2stage, zhetrd_he2hb, + zhetrd_hb2st, zhb2st_kernels, zheevd_2stage, + zheev_2stage, zheevx_2stage, zheevr_2stage, + zhbev_2stage, zhbevx_2stage, zhbevd_2stage, + zhegv_2stage, +# 3.8.0 + ssysv_aa_2stage, ssytrf_aa_2stage, + ssytrs_aa_2stage, chesv_aa_2stage, + chetrf_aa_2stage, chetrs_aa_2stage, + csysv_aa_2stage, csytrf_aa_2stage, + csytrs_aa_2stage, dsysv_aa_2stage, + dsytrf_aa_2stage, dsytrs_aa_2stage, + zhesv_aa_2stage, zhetrf_aa_2stage, + zhetrs_aa_2stage, zsysv_aa_2stage, + zsytrf_aa_2stage, zsytrs_aa_2stage ); From 911c3e2f4b4d557e8b65624beb90d5138289a4f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 24 Oct 2019 22:43:27 +0200 Subject: [PATCH 747/935] Improve support for g95 and non-GNU ld Auto-add "-fno-second-underscore" option to make LAPACKE compile (as it calls LAPACK functions that may have gotten a second underscore added otherwise). 
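For background on this and the gensymbol change above: g77-style Fortran compilers append a second trailing underscore to any symbol whose name already contains an underscore, so the same routine exports two different link names depending on the convention. A small C illustration (the prototypes are illustrative stubs, not the real signatures), using one routine from the list above:

    /* Fortran: SUBROUTINE DSYTRF_RK(...)                            */
    extern void dsytrf_rk_(void);   /* single-underscore name        */
    extern void dsytrf_rk__(void);  /* g77/g95 double-underscore name */
    /* LAPACKE references the first form, so either the compiler is
       told -fno-second-underscore or the symbol has to be treated as
       one with an embedded underscore for the link test to pass.    */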
Also support -R for rpath when parsing compiler directives in f_check
---
 Makefile.system | 3 +++
 f_check | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 8843d0ad3..4cb4dc954 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -769,6 +769,9 @@ else
 FCOMMON_OPT += -m32
 endif
 endif
+ifneq ($(NO_LAPACKE), 1)
+FCOMMON_OPT += -fno-second-underscore
+endif
 endif
 endif
 
diff --git a/f_check b/f_check
index b05db85bd..0afbab23a 100644
--- a/f_check
+++ b/f_check
@@ -130,6 +130,11 @@ if ($compiler eq "") {
 if ($data =~ / zho_ge__/) {
 $need2bu = 1;
 }
+ if ($vendor =~ /G95/) {
+ if ($ENV{NO_LAPACKE} != 1) {
+ $need2bu = "";
+ }
+ }
 }
 
 if ($vendor eq "") {
@@ -277,6 +282,8 @@ $linker_a = "";
 if ($link ne "") {
 
 $link =~ s/\-Y\sP\,/\-Y/g;
+
+ $link =~ s/\-R+/\-rpath\@/g;
 
 $link =~ s/\-rpath\s+/\-rpath\@/g;
 

From e3e8b5cdca829e49edfc3e2d1a691e0c0ae33837 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 25 Oct 2019 12:51:06 +0200
Subject: [PATCH 748/935] Add NetBSD

---
 getarch.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/getarch.c b/getarch.c
index 4d960356c..1f590390a 100644
--- a/getarch.c
+++ b/getarch.c
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef OS_WINDOWS
 #include <windows.h>
 #endif
-#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
@@ -1201,7 +1201,7 @@ static int get_num_cores(void) {
 
 #ifdef OS_WINDOWS
 SYSTEM_INFO sysinfo;
-#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
+#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
 int m[2], count;
 size_t len;
 #endif
@@ -1215,7 +1215,7 @@ static int get_num_cores(void) {
 GetSystemInfo(&sysinfo);
 return sysinfo.dwNumberOfProcessors;
 
-#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
+#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
 m[0] = CTL_HW;
 m[1] = HW_NCPU;
 len = sizeof(int);

From 1b9098966242810b7b53c4a8d3009e5a38df63e9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 25 Oct 2019 12:52:49 +0200
Subject: [PATCH 749/935] Add NetBSD to the xBSD conditionals

---
 driver/others/memory.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 534d6d9fc..55dce72b8 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -129,7 +129,7 @@
 #include <kernel/OS.h>
 #endif
 
-#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
 #include <sys/types.h>
 #include <sys/sysctl.h>
@@ -192,7 +192,7 @@ void goto_set_num_threads(int num_threads) {};
 
 #else
 
-#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
+#if defined(OS_LINUX) || defined(OS_SUNOS)
 #ifndef NO_AFFINITY
 int get_num_procs(void);
 #else
@@ -312,7 +312,7 @@ int get_num_procs(void) {
 
 #endif
 
-#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
 
 int get_num_procs(void) {
 
@@ -404,7 +404,7 @@ extern int openblas_goto_num_threads_env();
 extern int openblas_omp_num_threads_env();
 
 int blas_get_cpu_number(void){
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
 int max_num;
 #endif
 int blas_goto_num = 0;
@@ -412,7 +412,7 @@ int blas_get_cpu_number(void){
 
 if (blas_num_threads) return blas_num_threads;
 
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
 max_num = get_num_procs();
 #endif
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
 else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
 else blas_num_threads = MAX_CPU_NUMBER;
 
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
 if (blas_num_threads > max_num) blas_num_threads = max_num;
 #endif
@@ -1673,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
 #include <kernel/OS.h>
 #endif
 
-#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
 #include <sys/types.h>
 #include <sys/sysctl.h>
@@ -1736,7 +1736,7 @@ void goto_set_num_threads(int num_threads) {};
 
 #else
 
-#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
+#if defined(OS_LINUX) || defined(OS_SUNOS)
 #ifndef NO_AFFINITY
 int get_num_procs(void);
 #else
@@ -1855,7 +1855,7 @@ int get_num_procs(void) {
 
 #endif
 
-#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
 
 int get_num_procs(void) {
 
@@ -1945,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
 extern int openblas_omp_num_threads_env();
 
 int blas_get_cpu_number(void){
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) ||
defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1953,7 +1953,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -1977,7 +1977,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From aeabe0a83fffce9ab43ab8d10795e1696574887c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Oct 2019 22:52:30 +0200 Subject: [PATCH 750/935] Fix regex to parse -R options with and without whitespace Both forms are seen on NetBSD (#2288) --- f_check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/f_check b/f_check index 0afbab23a..993ad9a35 100644 --- a/f_check +++ b/f_check @@ -19,7 +19,7 @@ $nofortran = 0; $compiler = join(" ", @ARGV); $compiler_bin = shift(@ARGV); - + # f77 is too ambiguous $compiler = "" if $compiler eq "f77"; @@ -283,7 +283,7 @@ if ($link ne "") { $link =~ s/\-Y\sP\,/\-Y/g; - $link =~ s/\-R+/\-rpath\@/g; + $link =~ s/\-R\s*/\-rpath\@/g; $link =~ s/\-rpath\s+/\-rpath\@/g; From 85ccdce8c4bfeb3f8de6dd939317631c52f1cca7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Oct 2019 23:02:37 +0200 Subject: [PATCH 751/935] Remove the IOS fallbacks to generic C kernels --- kernel/arm64/KERNEL.ARMV8 | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index a2a435738..efc1ec8bc 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -91,12 +91,10 @@ IDAMAXKERNEL = iamax.S ICAMAXKERNEL = izamax.S IZAMAXKERNEL = izamax.S -ifneq ($(OS_DARWIN)$(CROSS),11) SNRM2KERNEL = nrm2.S DNRM2KERNEL = nrm2.S CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S -endif DDOTKERNEL = dot.S SDOTKERNEL = dot.S @@ -104,38 +102,6 @@ CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S -ifeq ($(OS_DARWIN)$(CROSS),11) - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = 
cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
-ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
-ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-else
 SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
@@ -202,5 +168,3 @@ ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-endif

From df857551c0d7062ebe03c9600de97fcf0620e0c2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 25 Oct 2019 23:07:00 +0200
Subject: [PATCH 752/935] Remove special parameter set for obsolete IOS/ARMV8
 workaround

---
 param.h | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/param.h b/param.h
index 860106991..238089e60 100644
--- a/param.h
+++ b/param.h
@@ -2588,38 +2588,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SYMV_P 16
 
-// Darwin / Cross
-#if defined(OS_DARWIN) && defined(CROSS)
-
-#define SGEMM_DEFAULT_UNROLL_M 2
-#define SGEMM_DEFAULT_UNROLL_N 2
-
-#define DGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_N 2
-
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
-
-#define ZGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_N 2
-
-#define SGEMM_DEFAULT_P 128
-#define DGEMM_DEFAULT_P 128
-#define CGEMM_DEFAULT_P 96
-#define ZGEMM_DEFAULT_P 64
-
-#define SGEMM_DEFAULT_Q 240
-#define DGEMM_DEFAULT_Q 120
-#define CGEMM_DEFAULT_Q 120
-#define ZGEMM_DEFAULT_Q 120
-
-#define SGEMM_DEFAULT_R 12288
-#define DGEMM_DEFAULT_R 8192
-#define CGEMM_DEFAULT_R 4096
-#define ZGEMM_DEFAULT_R 4096
-
-#else // Linux / Native
-
 #if defined(CORTEXA53) || defined(CORTEXA57) || \
 defined(CORTEXA72) || defined(CORTEXA73) || \
 defined(FALKOR) || defined(TSV110)
@@ -2755,8 +2723,6 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #endif // Cores
 
-#endif // Linux / Darwin
-
 #endif // ARMv8
 
 #if defined(ARMV5)

From 8691825944521a6706988b12daa2a6c77cba4f53 Mon Sep 17 00:00:00 2001
From: "k.dunikowski"
Date: Mon, 28 Oct 2019 08:51:05 +0100
Subject: [PATCH 753/935] Fixed a minor cmake problem, occurring when
 DYNAMIC_CORE=ON and CMAKE_C_FLAGS was empty

---
 cmake/arch.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 5a7434551..f3ae84fe0 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -73,7 +73,7 @@ if (DYNAMIC_ARCH)
 endif ()
 if (NOT NO_AVX512)
 set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
- string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+ string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
 endif ()
 if (DYNAMIC_LIST)
 set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})

From 274ff5cdb884f869c8cb99afceb56b1e8f59f87f Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 1 Nov 2019 23:59:18 +0800
Subject: [PATCH 754/935] update sgemm_q on skylakex cpus

---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index 860106991..198839b4f 100644
--- a/param.h
+++ b/param.h
@@ -1699,7 +1699,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 320
 #define DGEMM_DEFAULT_Q 128
 #else
-#define SGEMM_DEFAULT_Q 384
+#define SGEMM_DEFAULT_Q 192
 #define DGEMM_DEFAULT_Q 128
 #endif
 #define CGEMM_DEFAULT_Q 192

From 1df9a2013d4793af36064a2ccb1bf147ca9a17c2 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Sat, 2 Nov 2019 00:00:48 +0800
Subject: [PATCH 755/935] new sgemm kernel for skylakex

---
 kernel/x86_64/KERNEL.SKYLAKEX | 2 +-
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 871 +++++++++++++++++++
 2 files changed, 872 insertions(+), 1 deletion(-)
 create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 82a455b44..a39030c53 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -1,6 +1,6 @@
 include $(KERNELDIR)/KERNEL.HASWELL
 
-SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c
+SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
 
 SGEMMINCOPY = ../generic/gemm_ncopy_16.c
 SGEMMITCOPY = sgemm_tcopy_16_skylakex.c

diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
new file mode 100644
index 000000000..79c70e4f6
--- /dev/null
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -0,0 +1,871 @@
+/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
+/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
+
+#include "common.h"
+#include <stdint.h>
+
+/* m = 16 */ /* zmm8-zmm31 for accumulators, zmm1-zmm7 for temporary use, zmm0 for alpha */
+#define KERNEL_k1m16n1 \
+ "vmovups (%0),%%zmm4; addq $64,%0;"\
+ "vbroadcastss (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8;"\
+ "addq $4,%1;"
+#define KERNEL_h_k1m16n2 \
+ "vmovsldup (%0),%%zmm4; vmovshdup (%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\
+ "vbroadcastsd (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8; vfmadd231ps %%zmm5,%%zmm6,%%zmm9;"
+#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $8,%1;"
+#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "vbroadcastsd 8(%1),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,%%zmm10; vfmadd231ps %%zmm5,%%zmm7,%%zmm11;"
+#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
+#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...)
\ + "vbroadcastsd ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\ + "vbroadcastsd 8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";" +#define KERNEL_h_k1m16n8 KERNEL_h_k1m16n4 unit_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1) +#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%1;" +#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n8 unit_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) +#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%1;" +#define KERNEL_h_k1m16n16 KERNEL_k1m16n12 unit_kernel_k1m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15) +#define KERNEL_k1m16n16 KERNEL_h_k1m16n16 "addq $16,%%r15;" +#define KERNEL_h_k1m16n20 KERNEL_h_k1m16n16 unit_kernel_k1m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1) +#define KERNEL_k1m16n20 KERNEL_h_k1m16n20 "addq $16,%%r15;" +#define KERNEL_h_k1m16n24 KERNEL_h_k1m16n20 unit_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2) +#define KERNEL_k1m16n24 KERNEL_h_k1m16n24 "addq $16,%%r15;" +#define INIT_m16n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;" +#define INIT_m16n2 INIT_m16n1 "vpxorq %%zmm9,%%zmm9,%%zmm9;" +#define INIT_m16n4 INIT_m16n2 "vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" +#define unit_init_m16n4(c1,c2,c3,c4) \ + "vpxorq "#c1","#c1","#c1";vpxorq "#c2","#c2","#c2";vpxorq "#c3","#c3","#c3";vpxorq "#c4","#c4","#c4";" +#define INIT_m16n8 INIT_m16n4 unit_init_m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15) +#define INIT_m16n12 INIT_m16n8 unit_init_m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19) +#define INIT_m16n16 INIT_m16n12 unit_init_m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23) +#define INIT_m16n20 INIT_m16n16 unit_init_m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27) +#define INIT_m16n24 INIT_m16n20 unit_init_m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31) +#define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" +#define unit_save_m16n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ + "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\ + "prefetcht1 127(%5); prefetcht1 127(%5,%3,1);"\ + "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9) +#define SAVE_h_m16n4 SAVE_h_m16n2 unit_save_m16n2(%%zmm10,%%zmm11) +#define SAVE_h_m16n8 SAVE_h_m16n4 unit_save_m16n2(%%zmm12,%%zmm13) unit_save_m16n2(%%zmm14,%%zmm15) +#define SAVE_h_m16n12 SAVE_h_m16n8 unit_save_m16n2(%%zmm16,%%zmm17) unit_save_m16n2(%%zmm18,%%zmm19) +#define SAVE_h_m16n16 SAVE_h_m16n12 unit_save_m16n2(%%zmm20,%%zmm21) unit_save_m16n2(%%zmm22,%%zmm23) +#define SAVE_h_m16n20 SAVE_h_m16n16 unit_save_m16n2(%%zmm24,%%zmm25) unit_save_m16n2(%%zmm26,%%zmm27) +#define SAVE_h_m16n24 SAVE_h_m16n20 unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31) +#define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;" +#define COMPUTE_m16(ndim) \ + INIT_m16n##ndim\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + "cmpq $4,%4; jb "#ndim"016162f;"\ + #ndim"016161:\n\t"\ + KERNEL_k1m16n##ndim\ + KERNEL_k1m16n##ndim\ + KERNEL_k1m16n##ndim\ + KERNEL_k1m16n##ndim\ + "subq $4,%4; cmpq $4,%4; jnb "#ndim"016161b;"\ + #ndim"016162:\n\t"\ + "testq %4,%4; jz "#ndim"016163f;"\ + KERNEL_k1m16n##ndim\ + "decq %4; jmp "#ndim"016162b;"\ + #ndim"016163:\n\t"\ + SAVE_m16(ndim) + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define 
KERNEL_k1m8n1(b_addr) \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss ("#b_addr"),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,"#b_addr";" +#define KERNEL_h_k1m8n2(b_addr) \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd ("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2(b_addr) KERNEL_h_k1m8n2(b_addr) "addq $8,"#b_addr";" +#define KERNEL_h_k1m8n4(b_addr) \ + KERNEL_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4(b_addr) KERNEL_h_k1m8n4(b_addr) "addq $16,"#b_addr";" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \ + "vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8(b_addr) KERNEL_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1) +#define KERNEL_k1m8n8(b_addr) KERNEL_h_k1m8n8(b_addr) "addq $16,"#b_addr";" +#define KERNEL_h_k1m8n12(b_addr) KERNEL_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2) +#define KERNEL_k1m8n12(b_addr) KERNEL_h_k1m8n12(b_addr) "addq $16,"#b_addr";" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\ + "vunpcklpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5), %%ymm0,%%ymm1;vmovups %%ymm1,(%5);"\ + "vunpckhpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1;vmovups %%ymm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_L_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_L_m8n4 SAVE_L_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_L_m8n8 SAVE_L_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_L_m8n12 SAVE_L_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define SAVE_R_m8n4 unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_R_m8n8 SAVE_R_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_R_m8n12 SAVE_R_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define COMPUTE_L_m8(ndim,sim) \ + INIT_m8n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim""#sim"882:\n\t"\ + "testq %4,%4; jz "#ndim""#sim"883f;"\ + KERNEL_k1m8n##ndim(%1)\ + "decq %4; jmp "#ndim""#sim"882b;"\ + #ndim""#sim"883:\n\t"\ + SAVE_L_m8n##ndim "addq $32,%2;" +#define COMPUTE_R_m8(ndim,sim) \ + "subq %%r12,%0; subq %%r12,%0;"\ + INIT_m8n##ndim\ + "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ + #ndim""#sim"882:\n\t"\ + "testq %4,%4; jz "#ndim""#sim"883f;"\ + KERNEL_k1m8n##ndim(%%r15)\ + "decq %4; jmp "#ndim""#sim"882b;"\ + #ndim""#sim"883:\n\t"\ + SAVE_R_m8n##ndim +#define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833) +#define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833) +#define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833) 
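/* With only 16 ymm registers, the m=8 path can hold at most 12 accumulators
   (ymm4-ymm15), i.e. n <= 12 per pass. The COMPUTE_m8_nX wrappers here
   therefore split n = 16/20/24 into a left pass over the first 12 columns
   (COMPUTE_L_m8) and a right pass over the remaining 4/8/12 columns
   (COMPUTE_R_m8), which first rewinds the A pointer by 2*r12 = 32*k bytes;
   the numeric second argument is only used to build unique asm jump labels.
   A minimal sketch of the two-pass split, with hypothetical helper names
   rather than the kernel's real macros:

       static void compute_m8(int n, int k) {
           accumulate_cols(0, n < 12 ? n : 12, k);  // left pass, <= 12 accumulators
           if (n > 12)                              // rewind A by 32*k bytes, then
               accumulate_cols(12, n - 12, k);      // right pass over the rest
       }
*/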
+#define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833) +#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833) +#define COMPUTE_m8_n16 COMPUTE_L_m8(12,33733) COMPUTE_R_m8(4,33933) +#define COMPUTE_m8_n20 COMPUTE_L_m8(12,33633) COMPUTE_R_m8(8,33933) +#define COMPUTE_m8_n24 COMPUTE_L_m8(12,33533) COMPUTE_R_m8(12,33933) +#define COMPUTE_m8(ndim) COMPUTE_m8_n##ndim + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1(b_addr) \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,"#b_addr";" +#define KERNEL_h_k1m4n2(b_addr) \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup ("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m4n2(b_addr) KERNEL_h_k1m4n2(b_addr) "addq $8,"#b_addr";" +#define KERNEL_h_k1m4n4(b_addr) \ + KERNEL_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4(b_addr) KERNEL_h_k1m4n4(b_addr) "addq $16,"#b_addr";" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ + "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8(b_addr) KERNEL_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1) +#define KERNEL_k1m4n8(b_addr) KERNEL_h_k1m4n8(b_addr) "addq $16,"#b_addr";" +#define KERNEL_h_k1m4n12(b_addr) KERNEL_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2) +#define KERNEL_k1m4n12(b_addr) KERNEL_h_k1m4n12(b_addr) "addq $16,"#b_addr";" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\ + "vunpcklpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5), %%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\ + "vunpckhpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5,%3,1),%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_L_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_L_m4n4 SAVE_L_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_L_m4n8 SAVE_L_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_L_m4n12 SAVE_L_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) +#define SAVE_R_m4n4 unit_save_m4n2(%%xmm4,%%xmm5) unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_R_m4n8 SAVE_R_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_R_m4n12 SAVE_R_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) +#define COMPUTE_L_m4(ndim,sim) \ + INIT_m4n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim""#sim"442:\n\t"\ + "testq %4,%4; jz "#ndim""#sim"443f;"\ + KERNEL_k1m4n##ndim(%1)\ + "decq %4; jmp "#ndim""#sim"442b;"\ + #ndim""#sim"443:\n\t"\ + SAVE_L_m4n##ndim "addq $16,%2;" +#define COMPUTE_R_m4(ndim,sim) \ + 
"subq %%r12,%0;"\ + INIT_m4n##ndim\ + "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ + #ndim""#sim"442:\n\t"\ + "testq %4,%4; jz "#ndim""#sim"443f;"\ + KERNEL_k1m4n##ndim(%%r15)\ + "decq %4; jmp "#ndim""#sim"442b;"\ + #ndim""#sim"443:\n\t"\ + SAVE_R_m4n##ndim +#define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855) +#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855) +#define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855) +#define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855) +#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855) +#define COMPUTE_m4_n16 COMPUTE_L_m4(12,55755) COMPUTE_R_m4(4,55955) +#define COMPUTE_m4_n20 COMPUTE_L_m4(12,55655) COMPUTE_R_m4(8,55955) +#define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955) +#define COMPUTE_m4(ndim) COMPUTE_m4_n##ndim + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1(b_addr) \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,"#b_addr";" +#define SAVE_L_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2(b_addr) \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,"#b_addr";" +#define SAVE_L_m2n2 SAVE_L_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4(b_addr) \ + "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8(b_addr) \ + "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12(b_addr) \ + "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_L_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_L_m2n8 SAVE_L_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_L_m2n12 SAVE_L_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) +#define SAVE_R_m2n4 unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_R_m2n8 SAVE_R_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define 
SAVE_R_m2n12 SAVE_R_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) +#define COMPUTE_L_m2(ndim,sim) \ + INIT_m2n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim""#sim"222:\n\t"\ + "testq %4,%4; jz "#ndim""#sim"223f;"\ + KERNEL_k1m2n##ndim(%1)\ + "decq %4; jmp "#ndim""#sim"222b;"\ + #ndim""#sim"223:\n\t"\ + SAVE_L_m2n##ndim "addq $8,%2;" +#define COMPUTE_R_m2(ndim,sim) \ + "salq $3,%%r13;subq %%r13,%0;sarq $3,%%r13;"\ + INIT_m2n##ndim\ + "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ + #ndim""#sim"222:\n\t"\ + "testq %4,%4; jz "#ndim""#sim"223f;"\ + KERNEL_k1m2n##ndim(%%r15)\ + "decq %4; jmp "#ndim""#sim"222b;"\ + #ndim""#sim"223:\n\t"\ + SAVE_R_m2n##ndim +#define COMPUTE_m2_n1 COMPUTE_L_m2(1,77877) +#define COMPUTE_m2_n2 COMPUTE_L_m2(2,77877) +#define COMPUTE_m2_n4 COMPUTE_L_m2(4,77877) +#define COMPUTE_m2_n8 COMPUTE_L_m2(8,77877) +#define COMPUTE_m2_n12 COMPUTE_L_m2(12,77877) +#define COMPUTE_m2_n16 COMPUTE_L_m2(12,77777) COMPUTE_R_m2(4,77977) +#define COMPUTE_m2_n20 COMPUTE_L_m2(12,77677) COMPUTE_R_m2(8,77977) +#define COMPUTE_m2_n24 COMPUTE_L_m2(12,77577) COMPUTE_R_m2(12,77977) +#define COMPUTE_m2(ndim) COMPUTE_m2_n##ndim + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1(b_addr) \ + "vmovss ("#b_addr"),%%xmm3; addq $4,"#b_addr";"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_L_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2(b_addr) \ + "vmovsd ("#b_addr"),%%xmm3; addq $8,"#b_addr";"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_L_m1n2 \ + "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4(b_addr) \ + "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8(b_addr) \ + "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12(b_addr) \ + "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#define unit_save_m1n4(c1) \ + "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_L_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_L_m1n8 SAVE_L_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_L_m1n12 SAVE_L_m1n8 unit_save_m1n4(%%xmm6) +#define SAVE_R_m1n4 unit_save_m1n4(%%xmm4) +#define SAVE_R_m1n8 SAVE_R_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_R_m1n12 SAVE_R_m1n8 
unit_save_m1n4(%%xmm6) +#define COMPUTE_L_m1(ndim,sim) \ + INIT_m1n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim""#sim"112:\n\t"\ + "testq %4,%4; jz "#ndim""#sim"113f;"\ + KERNEL_k1m1n##ndim(%1)\ + "decq %4; jmp "#ndim""#sim"112b;"\ + #ndim""#sim"113:\n\t"\ + SAVE_L_m1n##ndim "addq $4,%2;" +#define COMPUTE_R_m1(ndim,sim) \ + "salq $2,%%r13;subq %%r13,%0;sarq $2,%%r13;"\ + INIT_m1n##ndim\ + "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ + #ndim""#sim"112:\n\t"\ + "testq %4,%4; jz "#ndim""#sim"113f;"\ + KERNEL_k1m1n##ndim(%%r15)\ + "decq %4; jmp "#ndim""#sim"112b;"\ + #ndim""#sim"113:\n\t"\ + SAVE_R_m1n##ndim +#define COMPUTE_m1_n1 COMPUTE_L_m1(1,99899) +#define COMPUTE_m1_n2 COMPUTE_L_m1(2,99899) +#define COMPUTE_m1_n4 COMPUTE_L_m1(4,99899) +#define COMPUTE_m1_n8 COMPUTE_L_m1(8,99899) +#define COMPUTE_m1_n12 COMPUTE_L_m1(12,99899) +#define COMPUTE_m1_n16 COMPUTE_L_m1(12,99799) COMPUTE_R_m1(4,99999) +#define COMPUTE_m1_n20 COMPUTE_L_m1(12,99699) COMPUTE_R_m1(8,99999) +#define COMPUTE_m1_n24 COMPUTE_L_m1(12,99599) COMPUTE_R_m1(12,99999) +#define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ +/* %6 = "+r"(&alpha), %7 = "+r"(M) */ +/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "vbroadcastss (%6),%%zmm0;"\ + "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $16,%7;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m16(ndim)\ + "subq $16,%7;cmpq $16,%7;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $8,%7;jb 33102"#ndim"f;"\ + COMPUTE_m8(ndim)\ + "subq $8,%7;"\ + "33102"#ndim":\n\t"\ + "cmpq $4,%7;jb 33103"#ndim"f;"\ + COMPUTE_m4(ndim)\ + "subq $4,%7;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%7;jb 33104"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%7;"\ + "33104"#ndim":\n\t"\ + "testq %7,%7;jz 33105"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M)\ + ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ + "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ + "cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K;c_pointer += LDC * ndim - M;\ +} +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0||alpha==(float)0.0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha; + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA; + for(;n_count>23;n_count-=24) COMPUTE(24) + for(;n_count>19;n_count-=20) COMPUTE(20) + for(;n_count>15;n_count-=16) COMPUTE(16) + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} + +#include +/* codes below are copied from the sgemm kernel written by Arjan van der Ven */ + +/* + * "Direct sgemm" code. 
This code operates directly on the inputs and outputs
+ * of the sgemm call, avoiding the copies, memory realignments and threading,
+ * and only supports alpha = 1 and beta = 0.
+ * This is a common case and provides value for relatively small matrices.
+ * For larger matrices the "regular" sgemm code is superior; there the cost of
+ * copying/shuffling the B matrix really pays off.
+ */
+
+
+
+#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps()
+#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
+#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)])
+#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M)
+
+
+#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps()
+#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
+#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)])
+#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M)
+
+#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps()
+#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
+#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)])
+#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M)
+
+#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0;
+#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)];
+#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + N];
+#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N;
+#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M;
+
+int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
+{
+ int mnk = M * N * K;
+ /* large matrices -> not performant */
+ if (mnk >= 28 * 512 * 512)
+  return 0;
+
+ /*
+  * if the B matrix is not a nice multiple of 4 we get many unaligned accesses,
+  * and the regular sgemm copy/realignment of data pays off much quicker
+  */
+ if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
+  return 0;
+
+#ifdef SMP
+ /* if we can run multithreaded, the threading changes the break-even threshold */
+ if (mnk > 2 * 350 * 512 && num_cpu_avail(3) > 1)
+  return 0;
+#endif
+
+ return 1;
+}
+
+
+
+void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
+{
+ int i, j, k;
+
+ int m4 = M & ~3;
+ int m2 = M & ~1;
+
+ int n64 = N & ~63;
+ int n32 = N & ~31;
+ int n16 = N & ~15;
+ int n8 = N & ~7;
+ int n4 = N & ~3;
+ int n2 = N & ~1;
+
+ i = 0;
+
+ for (i = 0; i < m4; i+=4) {
+
+  for (j = 0; j < n64; j+= 64) {
+   k = 0;
+   DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
+   DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1);
+   DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2);
+   DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3);
+
+
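+ /* The k loop below keeps the whole 4x64 output tile resident in 16 zmm
+    accumulators: each iteration broadcasts one A element per row (4
+    broadcasts), loads four 16-float vectors of B (64 columns), and issues
+    4*4 = 16 FMAs. A scalar model of one iteration:
+
+        for (int mm = 0; mm < 4; mm++)
+            for (int nn = 0; nn < 64; nn++)
+                result[mm][nn] += A[k + strideA*(i+mm)] * B[strideB*k + j + nn];
+ */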
for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + DECLARE_RESULT_256(0, 2); + DECLARE_RESULT_256(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + BROADCAST_LOAD_A_256(x, 2); + BROADCAST_LOAD_A_256(x, 3); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + MATMUL_256(0, 2); + MATMUL_256(0, 3); + } + STORE_256(0, 0); + STORE_256(0, 1); + STORE_256(0, 2); + STORE_256(0, 3); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + DECLARE_RESULT_128(0, 2); + DECLARE_RESULT_128(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + BROADCAST_LOAD_A_128(x, 2); + BROADCAST_LOAD_A_128(x, 3); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + MATMUL_128(0, 2); + MATMUL_128(0, 3); + } + STORE_128(0, 0); + STORE_128(0, 1); + STORE_128(0, 2); + STORE_128(0, 3); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2); + DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + BROADCAST_LOAD_A_SCALAR(x, 2); + BROADCAST_LOAD_A_SCALAR(x, 3); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + 
MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2); + MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + STORE_SCALAR(0, 2); STORE_SCALAR(1, 2); + STORE_SCALAR(0, 3); STORE_SCALAR(1, 3); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0) + DECLARE_RESULT_SCALAR(0, 1) + DECLARE_RESULT_SCALAR(0, 2) + DECLARE_RESULT_SCALAR(0, 3) + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + BROADCAST_LOAD_A_SCALAR(0, 2); + BROADCAST_LOAD_A_SCALAR(0, 3); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + MATMUL_SCALAR(0, 2); + MATMUL_SCALAR(0, 3); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + STORE_SCALAR(0, 2); + STORE_SCALAR(0, 3); + } + } + + for (; i < m2; i+=2) { + j = 0; + + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + } + STORE_256(0, 0); + STORE_256(0, 1); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + } + STORE_128(0, 0); + STORE_128(0, 1); + } + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + DECLARE_RESULT_SCALAR(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + } + 
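/* Note on the stores that follow: every STORE_* macro overwrites R outright
   (R[idx] = result##N##M) and no MATMUL_* macro scales by alpha, matching the
   "only supports alpha = 1 and beta = 0" restriction stated in the comment at
   the top of this direct-kernel section. */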
STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + } + } + + for (; i < M; i+=1) { + j = 0; + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + LOAD_B_256(0, x); + MATMUL_256(0, 0); + } + STORE_256(0, 0); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + LOAD_B_128(0, x); + MATMUL_128(0, 0); + } + STORE_128(0, 0); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + LOAD_B_SCALAR(0, 0); LOAD_B_SCALAR(1, 0); + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + LOAD_B_SCALAR(0, 0); + MATMUL_SCALAR(0, 0); + } + STORE_SCALAR(0, 0); + } + } +} From ae43b75a6a12d17a6ad769c64a306198a2b032b1 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 2 Nov 2019 10:09:19 +0800 Subject: [PATCH 757/935] Add files via upload --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 198839b4f..aa7cede0c 100644 --- a/param.h +++ b/param.h @@ -1696,7 +1696,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
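/* GEMM_DEFAULT_Q is the depth Kc of the packed panels in the blocked GEMM,
   so this hunk lowers the Windows-ABI SGEMM_DEFAULT_Q from 320 to 192, the
   value an earlier patch in this series already chose for the other ABIs,
   presumably so the new 16x4 Skylake-X kernel sees the same K blocking on
   every platform. A rough footprint check under those assumptions (Mr = 16,
   Kc = 192, single precision):

       16 * 192 * sizeof(float) = 12288 bytes = 12 KiB

   i.e. one packed Mr x Kc sliver of A streams comfortably through a 32 KiB
   L1 data cache. */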
#define ZGEMM_DEFAULT_P 256 #ifdef WINDOWS_ABI -#define SGEMM_DEFAULT_Q 320 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_Q 192 From 928fe1b28e91aa55748bfc7d2abf2ed2786d3ef5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Nov 2019 22:37:27 +0100 Subject: [PATCH 758/935] The assembly microkernel is not safe to use on ELFv1 --- kernel/power/idamax.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5bdc0a13c..337fa54f8 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -324,6 +324,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { +#if defined(_CALL_ELF) && (_CALL_ELF == 2) BLASLONG n1 = n & -32; if (n1 > 0) { @@ -331,7 +332,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } - +#endif while (i < n) { if (ABS(x[i]) > maxf) { max = i; From d999688d1a7a78ba8c69eedb8f26945aa1f04baf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Nov 2019 22:39:06 +0100 Subject: [PATCH 759/935] The assembly microkernel is not safe to use on ELFv1 --- kernel/power/idamin.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 7fe0f8a33..85dd49ac1 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -326,13 +326,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf = ABS(x[0]); //index's not incremented if (inc_x == 1) { +#if defined(_CALL_ELF) && (_CALL_ELF == 2) BLASLONG n1 = n & -32; if (n1 > 0) { min = diamin_kernel_32(n1, x, &minf); i = n1; } - +#endif + while (i < n) { if (ABS(x[i]) < minf) { min = i; From d2a628554921577f7353256c5888fc09dc3ccdae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Nov 2019 22:41:19 +0100 Subject: [PATCH 760/935] The assembly microkernel is not safe to use on ELFv1 --- kernel/power/izamin.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 1ffa3ba8b..8da2189c6 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -314,6 +314,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { minf = CABS1(x,0); //index will not be incremented + +#if defined(_CALL_ELF) && (_CALL_ELF == 2) BLASLONG n1 = n & -16; if (n1 > 0) { @@ -321,7 +323,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } - +#endif while(i < n) { From 68597002ea1342e370c892cddb8845db61936e4f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Nov 2019 22:42:46 +0100 Subject: [PATCH 761/935] The assembly microkernel is not safe to use on ELFv1 --- kernel/power/izamax.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index cfe78c8c0..3c132f81a 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -316,6 +316,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { +#if defined(_CALL_ELF) && (_CALL_ELF == 2) BLASLONG n1 = n & -16; if (n1 > 0) { @@ -323,6 +324,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif while(i < n) { From 6fa89b06a1f480065d29bfcafaedcae477ef8206 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Nov 2019 22:55:31 +0100 Subject: [PATCH 762/935] Use the two-operand form of DCBT on all PPC970 regardless of OS There seems to be no advantage to the three-operand form used in the earliest GotoBLAS kernels, and it causes compilation problems on other than the 
previously special-cased platforms as well --- common_power.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_power.h b/common_power.h index 5e15b7554..bcfc209a9 100644 --- a/common_power.h +++ b/common_power.h @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) ) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) #define DCBT_ARG 0 #else #define DCBT_ARG 8 From fbacd2605dd67f86f6c097b7b738138dd76913fa Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 4 Nov 2019 19:37:19 +0800 Subject: [PATCH 763/935] optimizations via software prefetches --- kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index 79c70e4f6..3646c7dda 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -42,7 +42,6 @@ #define unit_save_m16n2(c1,c2) \ "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\ - "prefetcht1 127(%5); prefetcht1 127(%5,%3,1);"\ "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" #define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9) #define SAVE_h_m16n4 SAVE_h_m16n2 unit_save_m16n2(%%zmm10,%%zmm11) @@ -54,19 +53,25 @@ #define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;" #define COMPUTE_m16(ndim) \ INIT_m16n##ndim\ - "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ - "cmpq $4,%4; jb "#ndim"016162f;"\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\ + "cmpq $16,%4; jb "#ndim"016162f;"\ #ndim"016161:\n\t"\ KERNEL_k1m16n##ndim\ KERNEL_k1m16n##ndim\ + "prefetcht1 (%5); prefetcht1 63(%5); addq %3,%5;"\ KERNEL_k1m16n##ndim\ KERNEL_k1m16n##ndim\ - "subq $4,%4; cmpq $4,%4; jnb "#ndim"016161b;"\ + "prefetcht1 (%8); addq $"#ndim",%8;"\ + "subq $4,%4; cmpq $16,%4; jnb "#ndim"016161b;"\ + "movq %2,%5;"\ #ndim"016162:\n\t"\ "testq %4,%4; jz "#ndim"016163f;"\ + "prefetcht0 (%5); prefetcht0 63(%5); prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1);"\ KERNEL_k1m16n##ndim\ + "leaq (%5,%3,2),%5;"\ "decq %4; jmp "#ndim"016162b;"\ #ndim"016163:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ SAVE_m16(ndim) /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ @@ -350,10 +355,11 @@ #define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ -/* %6 = "+r"(&alpha), %7 = "+r"(M) */ +/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */ /* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ #define COMPUTE(ndim) {\ + next_b = b_pointer + ndim * K;\ __asm__ __volatile__(\ "vbroadcastss (%6),%%zmm0;"\ "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\ @@ -378,7 +384,7 @@ COMPUTE_m1(ndim)\ "33105"#ndim":\n\t"\ "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ - 
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M)\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M),"+r"(next_b)\ ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ "cc","memory");\ @@ -391,7 +397,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha; int64_t M = (int64_t)m, K = (int64_t)k; BLASLONG n_count = n; - float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA,*next_b = B; for(;n_count>23;n_count-=24) COMPUTE(24) for(;n_count>19;n_count-=20) COMPUTE(20) for(;n_count>15;n_count-=16) COMPUTE(16) From 430c11e1357b78f6a2872ea48ef6e71989488386 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 4 Nov 2019 20:10:12 +0800 Subject: [PATCH 764/935] Add files via upload --- kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index 3646c7dda..5d491237b 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -54,15 +54,17 @@ #define COMPUTE_m16(ndim) \ INIT_m16n##ndim\ "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\ - "cmpq $16,%4; jb "#ndim"016162f;"\ + "cmpq $18,%4; jb "#ndim"016162f;"\ #ndim"016161:\n\t"\ KERNEL_k1m16n##ndim\ KERNEL_k1m16n##ndim\ + KERNEL_k1m16n##ndim\ "prefetcht1 (%5); prefetcht1 63(%5); addq %3,%5;"\ KERNEL_k1m16n##ndim\ KERNEL_k1m16n##ndim\ - "prefetcht1 (%8); addq $"#ndim",%8;"\ - "subq $4,%4; cmpq $16,%4; jnb "#ndim"016161b;"\ + KERNEL_k1m16n##ndim\ + "prefetcht1 (%8); addq $32,%8;"\ + "subq $6,%4; cmpq $18,%4; jnb "#ndim"016161b;"\ "movq %2,%5;"\ #ndim"016162:\n\t"\ "testq %4,%4; jz "#ndim"016163f;"\ From 836c414e22a52b8fe2a4c714d9711ac8aa204b0c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 5 Nov 2019 13:36:56 +0800 Subject: [PATCH 765/935] optimizations of software prefetching --- kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 126 +++++++++++--------- 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c index a958a1a6f..72878acfd 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -88,20 +88,21 @@ "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ #nn"00:\n\t" +/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ #define INNER_KERNELm8(nn) \ - "cmpq $8,%2;jb "#nn"001f;"\ + "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\ #nn"008:\n\t"\ INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - 
"subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\ + "prefetcht1 (%11); addq $16,%11;"\ + "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\ + "movq %3,%10;"\ #nn"001:\n\t"\ "cmpq $1,%2;jb "#nn"000f;"\ + "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ "decq %2;jmp "#nn"001b;"\ ""#nn"000:\n\t" @@ -158,53 +159,53 @@ #define INNER_STORE_m1n8(c1,disp) \ "kxnorw %%k1,%%k1,%%k1;"\ - "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ "kxnorw %%k1,%%k1,%%k1;"\ - "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};" + "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" #define INNER_SAVE_m1n8 \ + "movq %3,%10;"\ INNER_SETINDEX\ INNER_STORE_m1n8(%%zmm8,0) #define INNER_SAVE_m1n16 \ INNER_SAVE_m1n8\ - "leaq (%3,%4,8),%3;"\ + "leaq (%10,%4,8),%10;"\ INNER_STORE_m1n8(%%zmm9,0) #define INNER_SAVE_m1n24 \ INNER_SAVE_m1n16\ - "leaq (%3,%4,8),%3;"\ + "leaq (%10,%4,8),%10;"\ INNER_STORE_m1n8(%%zmm10,0) #define INNER_SAVE_m2n8 \ + "movq %3,%10;"\ INNER_SETINDEX\ INNER_STORE_m1n8(%%zmm8,0)\ INNER_STORE_m1n8(%%zmm9,8) #define INNER_SAVE_m2n16 \ + "movq %3,%10;"\ INNER_SETINDEX\ INNER_STORE_m1n8(%%zmm8,0)\ INNER_STORE_m1n8(%%zmm10,8)\ - "leaq (%3,%4,8),%3;"\ + "leaq (%10,%4,8),%10;"\ INNER_STORE_m1n8(%%zmm9,0)\ INNER_STORE_m1n8(%%zmm11,8) + #define INNER_SAVE_m2n24 \ + "movq %3,%10;"\ INNER_SETINDEX\ INNER_STORE_m1n8(%%zmm8,0)\ INNER_STORE_m1n8(%%zmm11,8)\ - "leaq (%3,%4,8),%3;"\ + "leaq (%10,%4,8),%10;"\ INNER_STORE_m1n8(%%zmm9,0)\ INNER_STORE_m1n8(%%zmm12,8)\ - "leaq (%3,%4,8),%3;"\ + "leaq (%10,%4,8),%10;"\ INNER_STORE_m1n8(%%zmm10,0)\ INNER_STORE_m1n8(%%zmm13,8) -#define INNER_PREF_8x8 \ - "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\ - "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\ - "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\ - "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\ - "subq %4,%3; subq %4,%3; subq %4,%3;" + #define INNER_TRANS_4x8(c1,c2,c3,c4) \ "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ @@ -212,6 +213,7 @@ "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" + #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ INNER_TRANS_4x8(c1,c2,c3,c4)\ INNER_TRANS_4x8(c5,c6,c7,c8)\ @@ -223,64 +225,69 @@ "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" + //%7 for k01(input) only when m=4 #define INNER_STORE_4x8(c1,c2,c3,c4) \ - "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ - "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ - "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd 
%%zmm3,%%zmm6,"#c3";"\ - "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ - "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\ - "leaq (%3,%4,4),%3;" + "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ + "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ + "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ + "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ + "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "leaq (%10,%4,4),%10;" + #define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\ - "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;" + "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" + #define INNER_SAVE_m4n8 \ + "movq %3,%10;"\ INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) + #define INNER_SAVE_m4n16 \ + "movq %3,%10;"\ INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) + #define INNER_SAVE_m4n24 \ + "movq %3,%10;"\ INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) + #define INNER_SAVE_m8n8 \ - INNER_PREF_8x8\ + "movq %3,%10;"\ INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) + #define INNER_SAVE_m8n16 \ - INNER_PREF_8x8\ + "movq %3,%10;"\ INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ 
INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ - INNER_PREF_8x8\ INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) + #define INNER_SAVE_m8n24 \ - INNER_PREF_8x8\ + "movq %3,%10;"\ INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ - INNER_PREF_8x8\ INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ - INNER_PREF_8x8\ INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) #define COMPUTE_n8 {\ + b_pref = packed_b_pointer + 8 * K;\ __asm__ __volatile__(\ "vbroadcastsd (%9),%%zmm3;"\ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ @@ -290,7 +297,7 @@ INNER_KERNELm8(8)\ INNER_SAVE_m8n8\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\ + "addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ "42222:\n\t"\ "cmpq $4,%8; jb 42223f;"\ @@ -298,7 +305,7 @@ INNER_KERNELm4(8)\ INNER_SAVE_m4n8\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\ + "addq $32,%3;"\ "subq $4,%8;"\ "42223:\n\t"\ "cmpq $2,%8; jb 42224f;"\ @@ -318,11 +325,13 @@ "42225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n16 {\ + b_pref = packed_b_pointer + 16 * K;\ __asm__ __volatile__(\ "vbroadcastsd (%9),%%zmm3;"\ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ @@ -332,7 +341,7 @@ INNER_KERNELm8(16)\ INNER_SAVE_m8n16\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ "32222:\n\t"\ "cmpq $4,%8; jb 32223f;"\ @@ -340,7 +349,7 @@ INNER_KERNELm4(16)\ INNER_SAVE_m4n16\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "addq $32,%3;"\ "subq $4,%8;"\ "32223:\n\t"\ "cmpq $2,%8; jb 32224f;"\ @@ -348,7 +357,7 @@ INNER_KERNELm2(16)\ INNER_SAVE_m2n16\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\ + "addq $16,%3;"\ "subq $2,%8;"\ "32224:\n\t"\ "cmpq $1,%8; jb 32225f;"\ @@ -356,17 +365,19 @@ INNER_KERNELm1(16)\ INNER_SAVE_m1n16\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\ + "addq $8,%3;"\ "32225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ "leaq (%1,%%r12,4),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + 
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n24 {\ + b_pref = packed_b_pointer + 24 * K;\ __asm__ __volatile__(\ "vbroadcastsd (%9),%%zmm3;"\ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ @@ -376,7 +387,7 @@ INNER_KERNELm8(24)\ INNER_SAVE_m8n24\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\ + "addq $64,%3;"\ "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ "22222:\n\t"\ "cmpq $4,%8; jb 22223f;"\ @@ -384,7 +395,7 @@ INNER_KERNELm4(24)\ INNER_SAVE_m4n24\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\ + "addq $32,%3;"\ "subq $4,%8;"\ "22223:\n\t"\ "cmpq $2,%8; jb 22224f;"\ @@ -392,7 +403,7 @@ INNER_KERNELm2(24)\ INNER_SAVE_m2n24\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\ + "addq $16,%3;"\ "subq $2,%8;"\ "22224:\n\t"\ "cmpq $1,%8; jb 22225f;"\ @@ -400,12 +411,13 @@ INNER_KERNELm1(24)\ INNER_SAVE_m1n24\ "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\ + "addq $8,%3;"\ "22225:\n\t"\ "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ @@ -415,8 +427,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG if(k==0 || m==0 || ndiv8==0) return; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); int64_t K = (int64_t)k; int64_t M = (int64_t)m; - double *a_block_pointer; - double *c_pointer = c; + double *a_block_pointer,*b_pref; + double *c_pointer = c,*c_store = c; __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; BLASLONG ndiv8_count; double *packed_b_pointer = packed_b; From 819e852ae76f49931ddd0c242b8d5569729677f9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 11 Nov 2019 20:04:52 +0800 Subject: [PATCH 766/935] AVX512 CGEMM & ZGEMM kernels 96-99% 1-thread performance of MKL2018 --- kernel/x86_64/KERNEL.SKYLAKEX | 3 + kernel/x86_64/cgemm_kernel_8x2_skylakex.c | 352 ++++++++++++++++++++++ kernel/x86_64/zgemm_kernel_4x2_skylakex.c | 283 +++++++++++++++++ 3 files changed, 638 insertions(+) create mode 100644 kernel/x86_64/cgemm_kernel_8x2_skylakex.c create mode 100644 kernel/x86_64/zgemm_kernel_4x2_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index a39030c53..d5d32d1b3 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ 
b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,3 +14,6 @@ DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c + +CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c +ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c diff --git a/kernel/x86_64/cgemm_kernel_8x2_skylakex.c b/kernel/x86_64/cgemm_kernel_8x2_skylakex.c new file mode 100644 index 000000000..35a57b98a --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_8x2_skylakex.c @@ -0,0 +1,352 @@ +#include +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define CGEMM_SKX_MODE 0 //not to do conjugation on a_block and b_block +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define CGEMM_SKX_MODE 1 //do conjugation on a_block, not b_block +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define CGEMM_SKX_MODE 2 //do conjugation on b_block, not a_block +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define CGEMM_SKX_MODE 3 //do conjugation on a_block and b_block +#endif + +// recommended settings: GEMM_DEFAULT_Q = 192, GEMM_DEFAULT_P = 384 +/* %0=a_pointer, %1=b_pointer, %2=c_pointer, %3=c_store, %4=ldc(bytes), %5=&constval, %6 = k_counter, %7 = m_counter, %8 = b_pref */ +// const float constval[4] = {alpha_r, alpha_i, -1, 1}; +/* r11 = m; r12 = k * 16; r13 = k; r14 = b_head; r15 = %1 + r12 * 3; */ +#define GENERAL_INIT "movq %7,%%r11; movq %1,%%r14; movq %6,%%r13; movq %6,%%r12; salq $4,%%r12;" +#define GENERAL_RECOVER "movq %%r11,%7; movq %%r13,%6; movq %%r14,%1;" +#define CONSTZMM_INIT "vbroadcastss (%5),%%zmm0; vbroadcastss 4(%5),%%zmm1; vbroadcastsd 8(%5),%%zmm2;" +#define COMPUTE_INIT "movq %%r13,%6; movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;" + +/* m=8, zmm0=alpha_r, zmm1=alpha_i, zmm2={-1,1,...,-1,1}, zmm3-zmm7 for temporary use, zmm8-zmm31 for accumulators */ +#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block + #define unit_kernel_k1m8n1(a_r,a_i,b_off,c_le,c_ri,...) \ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231ps "#a_r",%%zmm3,"#c_le"; vfmadd231ps "#a_i",%%zmm3,"#c_ri";" +#else //do conjugation on a_block + #define unit_kernel_k1m8n1(a_r,a_i,b_off,c_le,c_ri,...) \ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231ps "#a_r",%%zmm3,"#c_le"; vfnmadd231ps "#a_i",%%zmm3,"#c_ri";" +#endif +#define KERNEL_h_k1m8n1 \ + "vmovsldup (%0),%%zmm4; vmovshdup (%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\ + unit_kernel_k1m8n1(%%zmm4,%%zmm5,0,%%zmm8,%%zmm9,%1) +#define KERNEL_t_k1m8n1 KERNEL_h_k1m8n1 "addq $8,%1;" +#define KERNEL_h_k1m8n2 KERNEL_h_k1m8n1 unit_kernel_k1m8n1(%%zmm4,%%zmm5,8,%%zmm10,%%zmm11,%1) +#define KERNEL_t_k1m8n2 KERNEL_h_k1m8n2 "addq $16,%1;" +#define unit_kernel_k1m8n2(c1le,c1ri,c2le,c2ri,...) 
\ + unit_kernel_k1m8n1(%%zmm4,%%zmm5,0,c1le,c1ri,__VA_ARGS__)\ + unit_kernel_k1m8n1(%%zmm4,%%zmm5,8,c2le,c2ri,__VA_ARGS__) +#define KERNEL_h_k1m8n4 KERNEL_h_k1m8n2 unit_kernel_k1m8n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1) +#define KERNEL_t_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define KERNEL_t_k1m8n6 KERNEL_h_k1m8n4 unit_kernel_k1m8n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m8n8 KERNEL_t_k1m8n6 unit_kernel_k1m8n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15) +#define KERNEL_t_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%%r15;" +#define KERNEL_h_k1m8n10 KERNEL_h_k1m8n8 unit_kernel_k1m8n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1) +#define KERNEL_t_k1m8n10 KERNEL_h_k1m8n10 "addq $16,%%r15;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n10 unit_kernel_k1m8n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2) +#define KERNEL_t_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;" +#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 1 //not to do conjugation on b_block + #define unit_save_m8n1(c_le,c_ri,...) \ + "vpermilps $177,"#c_ri","#c_ri"; vfmadd231ps "#c_ri",%%zmm2,"#c_le"; vpermilps $177,"#c_le",%%zmm4;"\ + "vfmaddsub213ps ("#__VA_ARGS__"),%%zmm1,%%zmm4; vfmaddsub213ps %%zmm4,%%zmm0,"#c_le"; vmovups "#c_le",("#__VA_ARGS__");" +#else //do conjugation on b_block + #define unit_save_m8n1(c_le,c_ri,...) \ + "vpermilps $177,"#c_ri","#c_ri"; vfnmadd231ps "#c_ri",%%zmm2,"#c_le"; vpermilps $177,"#c_le",%%zmm4;"\ + "vfmsubadd213ps ("#__VA_ARGS__"),%%zmm0,"#c_le"; vfmsubadd231ps %%zmm4,%%zmm1,"#c_le"; vmovups "#c_le",("#__VA_ARGS__");" +#endif +#define SAVE_SETUP_m8 "movq %2,%3; addq $64,%2;" +#define SAVE_m8n1 SAVE_SETUP_m8 unit_save_m8n1(%%zmm8,%%zmm9,%3) +#define SAVE_m8n2 SAVE_m8n1 unit_save_m8n1(%%zmm10,%%zmm11,%3,%4,1) +#define unit_save_m8n2(c1le,c1ri,c2le,c2ri) \ + "leaq (%3,%4,2),%3;" unit_save_m8n1(c1le,c1ri,%3) unit_save_m8n1(c2le,c2ri,%3,%4,1) +#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15) +#define SAVE_m8n6 SAVE_m8n4 unit_save_m8n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19) +#define SAVE_m8n8 SAVE_m8n6 unit_save_m8n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23) +#define SAVE_m8n10 SAVE_m8n8 unit_save_m8n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27) +#define SAVE_m8n12 SAVE_m8n10 unit_save_m8n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31) +#define unit_init_m8n1(c_le,c_ri) "vpxorq "#c_le","#c_le","#c_le"; vpxorq "#c_ri","#c_ri","#c_ri";" +#define INIT_m8n1 unit_init_m8n1(%%zmm8,%%zmm9) +#define INIT_m8n2 INIT_m8n1 unit_init_m8n1(%%zmm10,%%zmm11) +#define INIT_m8n4 INIT_m8n2 unit_init_m8n1(%%zmm12,%%zmm13) unit_init_m8n1(%%zmm14,%%zmm15) +#define INIT_m8n6 INIT_m8n4 unit_init_m8n1(%%zmm16,%%zmm17) unit_init_m8n1(%%zmm18,%%zmm19) +#define INIT_m8n8 INIT_m8n6 unit_init_m8n1(%%zmm20,%%zmm21) unit_init_m8n1(%%zmm22,%%zmm23) +#define INIT_m8n10 INIT_m8n8 unit_init_m8n1(%%zmm24,%%zmm25) unit_init_m8n1(%%zmm26,%%zmm27) +#define INIT_m8n12 INIT_m8n10 unit_init_m8n1(%%zmm28,%%zmm29) unit_init_m8n1(%%zmm30,%%zmm31) +#define COMPUTE_m8(ndim) \ + INIT_m8n##ndim\ + COMPUTE_INIT "movq %2,%3;"\ + "cmpq $18,%6; jb "#ndim"88880f;"\ + #ndim"88889:\n\t"\ + KERNEL_t_k1m8n##ndim\ + KERNEL_t_k1m8n##ndim\ + KERNEL_t_k1m8n##ndim\ + "prefetcht1 (%3); prefetcht1 63(%3); addq %4,%3;"\ + KERNEL_t_k1m8n##ndim\ + KERNEL_t_k1m8n##ndim\ + KERNEL_t_k1m8n##ndim\ + "prefetcht1 (%8); addq $40,%8;"\ + "subq $6,%6; cmpq $18,%6; jnb "#ndim"88889b;"\ + "movq %2,%3;"\ + #ndim"88880:\n\t"\ + "testq %6,%6; jz "#ndim"88881f;"\ + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ + KERNEL_t_k1m8n##ndim\ + "decq %6; jmp "#ndim"88880b;"\ + 
#ndim"88881:\n\t"\ + SAVE_m8n##ndim + +/* m=4, ymm0-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilps($177,a0) + #define unit_kernel_k1m4n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231ps "#ap",%%ymm2,"#c1";"\ + "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231ps "#a0",%%ymm2,"#c1";" +#else //conjg_a != conjg_b + #define unit_kernel_k1m4n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231ps "#ap",%%ymm2,"#c1";"\ + "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231ps "#a0",%%ymm2,"#c1";" +#endif +#define KERNEL_h_k1m4n1 \ + "vmovups (%0),%%ymm0; vpermilps $177,%%ymm0,%%ymm1; addq $32,%0;"\ + unit_kernel_k1m4n1(%%ymm0,%%ymm1,0,4,%%ymm4,%1) +#define KERNEL_t_k1m4n1 KERNEL_h_k1m4n1 "addq $8,%1;" +#define KERNEL_h_k1m4n2 KERNEL_h_k1m4n1 unit_kernel_k1m4n1(%%ymm0,%%ymm1,8,12,%%ymm5,%1) +#define KERNEL_t_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" +#define unit_kernel_k1m4n2(c1,c2,...) \ + unit_kernel_k1m4n1(%%ymm0,%%ymm1,0,4,c1,__VA_ARGS__)\ + unit_kernel_k1m4n1(%%ymm0,%%ymm1,8,12,c2,__VA_ARGS__) +#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_kernel_k1m4n2(%%ymm6,%%ymm7,%1,%%r12,1) +#define KERNEL_t_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define KERNEL_t_k1m4n6 KERNEL_h_k1m4n4 unit_kernel_k1m4n2(%%ymm8,%%ymm9,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m4n8 KERNEL_t_k1m4n6 unit_kernel_k1m4n2(%%ymm10,%%ymm11,%%r15) +#define KERNEL_t_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%%r15;" +#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_kernel_k1m4n2(%%ymm12,%%ymm13,%%r15,%%r12,1) +#define KERNEL_t_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_kernel_k1m4n2(%%ymm14,%%ymm15,%%r15,%%r12,2) +#define KERNEL_t_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block + #define unit_save_m4n1(alp_r,alp_i,c1,...) \ + "vpermilps $177,"#c1",%%ymm3; vfmaddsub213ps ("#__VA_ARGS__"),"#alp_i",%%ymm3;"\ + "vfmaddsub213ps %%ymm3,"#alp_r","#c1";vmovups "#c1",("#__VA_ARGS__");" +#else //do conjugation on a_block + #define unit_save_m4n1(alp_r,alp_i,c1,...) 
\ + "vpermilps $177,"#c1",%%ymm3; vfmsubadd213ps ("#__VA_ARGS__"),"#alp_r","#c1";"\ + "vfmsubadd231ps %%ymm3,"#alp_i","#c1";vmovups "#c1",("#__VA_ARGS__");" +#endif +#define SAVE_SETUP_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%5),%%ymm0; vbroadcastss 4(%5),%%ymm1;" +#define SAVE_m4n1 SAVE_SETUP_m4 unit_save_m4n1(%%ymm0,%%ymm1,%%ymm4,%3) +#define SAVE_m4n2 SAVE_m4n1 unit_save_m4n1(%%ymm0,%%ymm1,%%ymm5,%3,%4,1) +#define unit_save_m4n2(c1,c2) \ + "leaq (%3,%4,2),%3;" unit_save_m4n1(%%ymm0,%%ymm1,c1,%3) unit_save_m4n1(%%ymm0,%%ymm1,c2,%3,%4,1) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%ymm6,%%ymm7) +#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(%%ymm8,%%ymm9) +#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(%%ymm10,%%ymm11) +#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(%%ymm12,%%ymm13) +#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(%%ymm14,%%ymm15) +#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define unit_init_m4n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";" +#define INIT_m4n2 unit_init_m4n2(%%ymm4,%%ymm5) +#define INIT_m4n4 INIT_m4n2 unit_init_m4n2(%%ymm6,%%ymm7) +#define INIT_m4n6 INIT_m4n4 unit_init_m4n2(%%ymm8,%%ymm9) +#define INIT_m4n8 INIT_m4n6 unit_init_m4n2(%%ymm10,%%ymm11) +#define INIT_m4n10 INIT_m4n8 unit_init_m4n2(%%ymm12,%%ymm13) +#define INIT_m4n12 INIT_m4n10 unit_init_m4n2(%%ymm14,%%ymm15) +#define COMPUTE_m4(ndim) \ + INIT_m4n##ndim\ + COMPUTE_INIT\ + #ndim"88440:\n\t"\ + "testq %6,%6; jz "#ndim"88441f;"\ + KERNEL_t_k1m4n##ndim\ + "decq %6; jmp "#ndim"88440b;"\ + #ndim"88441:\n\t"\ + SAVE_m4n##ndim + +/* m=2, xmm0-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b; + #define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\ + "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";" +#else //conjg_a != conjg_b + #define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\ + "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";" +#endif +#define KERNEL_h_k1m2n1 \ + "vmovups (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $16,%0;"\ + unit_kernel_k1m2n1(%%xmm0,%%xmm1,0,4,%%xmm4,%1) +#define KERNEL_t_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;" +#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1 unit_kernel_k1m2n1(%%xmm0,%%xmm1,8,12,%%xmm5,%1) +#define KERNEL_t_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" +#define unit_kernel_k1m2n2(c1,c2,...) \ + unit_kernel_k1m2n1(%%xmm0,%%xmm1,0,4,c1,__VA_ARGS__)\ + unit_kernel_k1m2n1(%%xmm0,%%xmm1,8,12,c2,__VA_ARGS__) +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_kernel_k1m2n2(%%xmm6,%%xmm7,%1,%%r12,1) +#define KERNEL_t_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" +#define KERNEL_t_k1m2n6 KERNEL_h_k1m2n4 unit_kernel_k1m2n2(%%xmm8,%%xmm9,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m2n8 KERNEL_t_k1m2n6 unit_kernel_k1m2n2(%%xmm10,%%xmm11,%%r15) +#define KERNEL_t_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%%r15;" +#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_kernel_k1m2n2(%%xmm12,%%xmm13,%%r15,%%r12,1) +#define KERNEL_t_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;" +#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_kernel_k1m2n2(%%xmm14,%%xmm15,%%r15,%%r12,2) +#define KERNEL_t_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" +#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block + #define unit_save_m2n1(alp_r,alp_i,c1,...) 
\ + "vpermilps $177,"#c1",%%xmm3; vfmaddsub213ps ("#__VA_ARGS__"),"#alp_i",%%xmm3;"\ + "vfmaddsub213ps %%xmm3,"#alp_r","#c1";vmovups "#c1",("#__VA_ARGS__");" +#else //do conjugation on a_block + #define unit_save_m2n1(alp_r,alp_i,c1,...) \ + "vpermilps $177,"#c1",%%xmm3; vfmsubadd213ps ("#__VA_ARGS__"),"#alp_r","#c1";"\ + "vfmsubadd231ps %%xmm3,"#alp_i","#c1";vmovups "#c1",("#__VA_ARGS__");" +#endif +#define SAVE_SETUP_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%5),%%xmm0; vbroadcastss 4(%5),%%xmm1;" +#define SAVE_m2n1 SAVE_SETUP_m2 unit_save_m2n1(%%xmm0,%%xmm1,%%xmm4,%3) +#define SAVE_m2n2 SAVE_m2n1 unit_save_m2n1(%%xmm0,%%xmm1,%%xmm5,%3,%4,1) +#define unit_save_m2n2(c1,c2) \ + "leaq (%3,%4,2),%3;" unit_save_m2n1(%%xmm0,%%xmm1,c1,%3) unit_save_m2n1(%%xmm0,%%xmm1,c2,%3,%4,1) +#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(%%xmm6,%%xmm7) +#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(%%xmm8,%%xmm9) +#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(%%xmm10,%%xmm11) +#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(%%xmm12,%%xmm13) +#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(%%xmm14,%%xmm15) +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define unit_init_m2n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";" +#define INIT_m2n2 unit_init_m2n2(%%xmm4,%%xmm5) +#define INIT_m2n4 INIT_m2n2 unit_init_m2n2(%%xmm6,%%xmm7) +#define INIT_m2n6 INIT_m2n4 unit_init_m2n2(%%xmm8,%%xmm9) +#define INIT_m2n8 INIT_m2n6 unit_init_m2n2(%%xmm10,%%xmm11) +#define INIT_m2n10 INIT_m2n8 unit_init_m2n2(%%xmm12,%%xmm13) +#define INIT_m2n12 INIT_m2n10 unit_init_m2n2(%%xmm14,%%xmm15) +#define COMPUTE_m2(ndim) \ + INIT_m2n##ndim\ + COMPUTE_INIT\ + #ndim"88220:\n\t"\ + "testq %6,%6; jz "#ndim"88221f;"\ + KERNEL_t_k1m2n##ndim\ + "decq %6; jmp "#ndim"88220b;"\ + #ndim"88221:\n\t"\ + SAVE_m2n##ndim + +/* m=1, xmm0-xmm3 and xmm10-xmm15 for temporary use, xmm4-xmm9 for accumulators */ +#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilps($177,a0) + #define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\ + "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";" + #define unit_kernel_k1m1n2(a0,ap,c1,...) \ + "vmovshdup ("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\ + "vmovsldup ("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";" +#else //conjg_a != conjg_b + #define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\ + "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";" + #define unit_kernel_k1m1n2(a0,ap,c1,...) 
\ + "vmovshdup ("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\ + "vmovsldup ("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";" +#endif +#define KERNEL_h_k1m1n1 \ + "vmovsd (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $8,%0;"\ + unit_kernel_k1m1n1(%%xmm0,%%xmm1,0,4,%%xmm4,%1) +#define KERNEL_t_k1m1n1 KERNEL_h_k1m1n1 "addq $8,%1;" +#define KERNEL_h_k1m1n2 \ + "vmovddup (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $8,%0;"\ + unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm4,%1) +#define KERNEL_t_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;" +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm5,%1,%%r12,1) +#define KERNEL_t_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" +#define KERNEL_t_k1m1n6 KERNEL_h_k1m1n4 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm6,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m1n8 KERNEL_t_k1m1n6 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm7,%%r15) +#define KERNEL_t_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%%r15;" +#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm8,%%r15,%%r12,1) +#define KERNEL_t_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;" +#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm9,%%r15,%%r12,2) +#define KERNEL_t_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block + #define unit_save_m1n1(alp_r,alp_i,c1,...) \ + "vpermilps $177,"#c1",%%xmm3; vmovsd ("#__VA_ARGS__"),%%xmm2; vfmaddsub213ps %%xmm2,"#alp_i",%%xmm3;"\ + "vfmaddsub213ps %%xmm3,"#alp_r","#c1";vmovsd "#c1",("#__VA_ARGS__");" + #define unit_save_m1n2(alp_r,alp_i,c1) \ + "vpermilps $177,"#c1",%%xmm3; vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2;"\ + "vfmaddsub213ps %%xmm2,"#alp_i",%%xmm3; vfmaddsub231ps "#c1","#alp_r",%%xmm3;"\ + "vmovsd %%xmm3,(%3); vmovhpd %%xmm3,(%3,%4,1); leaq (%3,%4,2),%3;" +#else //do conjugation on a_block + #define unit_save_m1n1(alp_r,alp_i,c1,...) 
\ + "vpermilps $177,"#c1",%%xmm3; vmovsd ("#__VA_ARGS__"),%%xmm2; vfmsubadd213ps %%xmm2,"#alp_r","#c1";"\ + "vfmsubadd231ps %%xmm3,"#alp_i","#c1";vmovsd "#c1",("#__VA_ARGS__");" + #define unit_save_m1n2(alp_r,alp_i,c1) \ + "vpermilps $177,"#c1",%%xmm3; vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2;"\ + "vfmsubadd213ps %%xmm2,"#alp_r","#c1"; vfmsubadd213ps "#c1","#alp_i",%%xmm3;"\ + "vmovsd %%xmm3,(%3); vmovhpd %%xmm3,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_SETUP_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%5),%%xmm0; vbroadcastss 4(%5),%%xmm1;" +#define SAVE_m1n1 SAVE_SETUP_m1 unit_save_m1n1(%%xmm0,%%xmm1,%%xmm4,%3) +#define SAVE_m1n2 SAVE_SETUP_m1 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm4) +#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm5) +#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm6) +#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm7) +#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm8) +#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm9) +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m1n2 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n6 INIT_m1n4 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define INIT_m1n8 INIT_m1n6 "vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m1n10 INIT_m1n8 "vpxor %%xmm8,%%xmm8,%%xmm8;" +#define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;" +#define COMPUTE_m1(ndim) \ + INIT_m1n##ndim\ + COMPUTE_INIT\ + #ndim"88110:\n\t"\ + "testq %6,%6; jz "#ndim"88111f;"\ + KERNEL_t_k1m1n##ndim\ + "decq %6; jmp "#ndim"88110b;"\ + #ndim"88111:\n\t"\ + SAVE_m1n##ndim + +#define COMPUTE(ndim) {\ + b_pref = b_pointer + ndim * K * 2;\ + __asm__ __volatile__(\ + GENERAL_INIT\ + CONSTZMM_INIT\ + "cmpq $8,%7;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8(ndim)\ + "subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%7;jb 33102"#ndim"f;"\ + COMPUTE_m4(ndim)\ + "subq $4,%7;"\ + "33102"#ndim":\n\t"\ + "cmpq $2,%7;jb 33103"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%7;"\ + "33103"#ndim":\n\t"\ + "testq %7,%7;jz 33104"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33104"#ndim":\n\t"\ + GENERAL_RECOVER\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(c_store),"+r"(ldc_in_bytes),"+r"(constval),"+r"(K),"+r"(M),"+r"(b_pref)\ + ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ + "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ + "cc","memory");\ + a_pointer -= M * K * 2; b_pointer += ndim * K * 2; c_pointer += (LDC * ndim - M) * 2;\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; float const_val[4] = {alphar, alphai, -1, 1}; + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*c_store = C,*constval = const_val,*b_pref = B; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>9;n_count-=10) COMPUTE(10) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} diff --git 
a/kernel/x86_64/zgemm_kernel_4x2_skylakex.c b/kernel/x86_64/zgemm_kernel_4x2_skylakex.c new file mode 100644 index 000000000..0606a3f7c --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_skylakex.c @@ -0,0 +1,283 @@ +#include "common.h" +#include + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define ZGEMM_SKX_MODE 0 //not to do conjugation on a_block and b_block +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define ZGEMM_SKX_MODE 1 //do conjugation on a_block, not b_block +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define ZGEMM_SKX_MODE 2 //do conjugation on b_block, not a_block +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define ZGEMM_SKX_MODE 3 //do conjugation on a_block and b_block +#endif + +// recommended settings: GEMM_DEFAULT_Q = 128, GEMM_DEFAULT_P = 256 +/* %0=a_pointer, %1=b_pointer, %2=c_pointer, %3=c_store, %4=ldc(bytes), %5=&constval, %6 = k_counter, %7 = m_counter, %8 = b_pref */ +// const double constval[4] = {alpha_r, alpha_i, -1, 1}; +/* r11 = m; r12 = k * 32; r13 = k; r14 = b_head; r15 = %1 + r12 * 3; */ +#define GENERAL_INIT "movq %7,%%r11; movq %1,%%r14; movq %6,%%r13; movq %6,%%r12; salq $5,%%r12;" +#define GENERAL_RECOVER "movq %%r11,%7; movq %%r13,%6; movq %%r14,%1;" +#define CONSTZMM_INIT "vbroadcastsd (%5),%%zmm0; vbroadcastsd 8(%5),%%zmm1; vbroadcastf32x4 16(%5),%%zmm2;" +#define COMPUTE_INIT "movq %%r13,%6; movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;" + +/* m=4, zmm0=alpha_r, zmm1=alpha_i, zmm2={-1,1,...,-1,1}, zmm3-zmm7 for temporary use, zmm8-zmm31 for accumulators */ +#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 //not to do conjugation on a_block + #define unit_kernel_k1m4n1(a_r,a_i,b_off,c_le,c_ri,...) \ + "vbroadcastf32x4 "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231pd "#a_r",%%zmm3,"#c_le"; vfmadd231pd "#a_i",%%zmm3,"#c_ri";" +#else //do conjugation on a_block + #define unit_kernel_k1m4n1(a_r,a_i,b_off,c_le,c_ri,...) \ + "vbroadcastf32x4 "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231pd "#a_r",%%zmm3,"#c_le"; vfnmadd231pd "#a_i",%%zmm3,"#c_ri";" +#endif +#define KERNEL_h_k1m4n1 \ + "vmovddup (%0),%%zmm4; vmovddup 8(%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\ + unit_kernel_k1m4n1(%%zmm4,%%zmm5,0,%%zmm8,%%zmm9,%1) +#define KERNEL_t_k1m4n1 KERNEL_h_k1m4n1 "addq $16,%1;" +#define KERNEL_h_k1m4n2 KERNEL_h_k1m4n1 unit_kernel_k1m4n1(%%zmm4,%%zmm5,16,%%zmm10,%%zmm11,%1) +#define KERNEL_t_k1m4n2 KERNEL_h_k1m4n2 "addq $32,%1;" +#define unit_kernel_k1m4n2(c1le,c1ri,c2le,c2ri,...) 
\ + unit_kernel_k1m4n1(%%zmm4,%%zmm5,0,c1le,c1ri,__VA_ARGS__)\ + unit_kernel_k1m4n1(%%zmm4,%%zmm5,16,c2le,c2ri,__VA_ARGS__) +#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_kernel_k1m4n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1) +#define KERNEL_t_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;" +#define KERNEL_t_k1m4n6 KERNEL_h_k1m4n4 unit_kernel_k1m4n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) "addq $32,%1;" +#define KERNEL_h_k1m4n8 KERNEL_t_k1m4n6 unit_kernel_k1m4n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15) +#define KERNEL_t_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%%r15;" +#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_kernel_k1m4n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1) +#define KERNEL_t_k1m4n10 KERNEL_h_k1m4n10 "addq $32,%%r15;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_kernel_k1m4n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2) +#define KERNEL_t_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%%r15;" +#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 1 //not to do conjugation on b_block + #define unit_save_m4n1(c_le,c_ri,...) \ + "vpermilpd $85,"#c_ri","#c_ri"; vfmadd231pd "#c_ri",%%zmm2,"#c_le"; vpermilpd $85,"#c_le",%%zmm4;"\ + "vfmaddsub213pd ("#__VA_ARGS__"),%%zmm1,%%zmm4; vfmaddsub213pd %%zmm4,%%zmm0,"#c_le"; vmovupd "#c_le",("#__VA_ARGS__");" +#else //do conjugation on b_block + #define unit_save_m4n1(c_le,c_ri,...) \ + "vpermilpd $85,"#c_ri","#c_ri"; vfnmadd231pd "#c_ri",%%zmm2,"#c_le"; vpermilpd $85,"#c_le",%%zmm4;"\ + "vfmsubadd213pd ("#__VA_ARGS__"),%%zmm0,"#c_le"; vfmsubadd231pd %%zmm4,%%zmm1,"#c_le"; vmovupd "#c_le",("#__VA_ARGS__");" +#endif +#define SAVE_SETUP_m4 "movq %2,%3; addq $64,%2;" +#define SAVE_m4n1 SAVE_SETUP_m4 unit_save_m4n1(%%zmm8,%%zmm9,%3) +#define SAVE_m4n2 SAVE_m4n1 unit_save_m4n1(%%zmm10,%%zmm11,%3,%4,1) +#define unit_save_m4n2(c1le,c1ri,c2le,c2ri) \ + "leaq (%3,%4,2),%3;" unit_save_m4n1(c1le,c1ri,%3) unit_save_m4n1(c2le,c2ri,%3,%4,1) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15) +#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19) +#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23) +#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27) +#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31) +#define unit_init_m4n1(c_le,c_ri) "vpxorq "#c_le","#c_le","#c_le"; vpxorq "#c_ri","#c_ri","#c_ri";" +#define INIT_m4n1 unit_init_m4n1(%%zmm8,%%zmm9) +#define INIT_m4n2 INIT_m4n1 unit_init_m4n1(%%zmm10,%%zmm11) +#define INIT_m4n4 INIT_m4n2 unit_init_m4n1(%%zmm12,%%zmm13) unit_init_m4n1(%%zmm14,%%zmm15) +#define INIT_m4n6 INIT_m4n4 unit_init_m4n1(%%zmm16,%%zmm17) unit_init_m4n1(%%zmm18,%%zmm19) +#define INIT_m4n8 INIT_m4n6 unit_init_m4n1(%%zmm20,%%zmm21) unit_init_m4n1(%%zmm22,%%zmm23) +#define INIT_m4n10 INIT_m4n8 unit_init_m4n1(%%zmm24,%%zmm25) unit_init_m4n1(%%zmm26,%%zmm27) +#define INIT_m4n12 INIT_m4n10 unit_init_m4n1(%%zmm28,%%zmm29) unit_init_m4n1(%%zmm30,%%zmm31) +#define COMPUTE_m4(ndim) \ + INIT_m4n##ndim\ + COMPUTE_INIT "movq %2,%3;"\ + "cmpq $20,%6; jb "#ndim"88440f;"\ + #ndim"88449:\n\t"\ + KERNEL_t_k1m4n##ndim\ + KERNEL_t_k1m4n##ndim\ + KERNEL_t_k1m4n##ndim\ + "prefetcht1 (%3); prefetcht1 63(%3); addq %4,%3;"\ + KERNEL_t_k1m4n##ndim\ + KERNEL_t_k1m4n##ndim\ + KERNEL_t_k1m4n##ndim\ + "prefetcht1 (%8); addq $24,%8;"\ + "subq $6,%6; cmpq $20,%6; jnb "#ndim"88449b;"\ + "movq %2,%3;"\ + #ndim"88440:\n\t"\ + "testq %6,%6; jz "#ndim"88441f;"\ + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ + KERNEL_t_k1m4n##ndim\ + "decq %6; jmp "#ndim"88440b;"\ + 
#ndim"88441:\n\t"\ + SAVE_m4n##ndim + +/* m=2, ymm0-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilpd($5,a0) + #define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vbroadcastsd "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#ap",%%ymm2,"#c1";"\ + "vbroadcastsd "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#a0",%%ymm2,"#c1";" +#else //conjg_a != conjg_b + #define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vbroadcastsd "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#ap",%%ymm2,"#c1";"\ + "vbroadcastsd "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#a0",%%ymm2,"#c1";" +#endif +#define KERNEL_h_k1m2n1 \ + "vmovupd (%0),%%ymm0; vpermilpd $5,%%ymm0,%%ymm1; addq $32,%0;"\ + unit_kernel_k1m2n1(%%ymm0,%%ymm1,0,8,%%ymm4,%1) +#define KERNEL_t_k1m2n1 KERNEL_h_k1m2n1 "addq $16,%1;" +#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1 unit_kernel_k1m2n1(%%ymm0,%%ymm1,16,24,%%ymm5,%1) +#define KERNEL_t_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;" +#define unit_kernel_k1m2n2(c1,c2,...) \ + unit_kernel_k1m2n1(%%ymm0,%%ymm1,0,8,c1,__VA_ARGS__)\ + unit_kernel_k1m2n1(%%ymm0,%%ymm1,16,24,c2,__VA_ARGS__) +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_kernel_k1m2n2(%%ymm6,%%ymm7,%1,%%r12,1) +#define KERNEL_t_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;" +#define KERNEL_t_k1m2n6 KERNEL_h_k1m2n4 unit_kernel_k1m2n2(%%ymm8,%%ymm9,%1,%%r12,2) "addq $32,%1;" +#define KERNEL_h_k1m2n8 KERNEL_t_k1m2n6 unit_kernel_k1m2n2(%%ymm10,%%ymm11,%%r15) +#define KERNEL_t_k1m2n8 KERNEL_h_k1m2n8 "addq $32,%%r15;" +#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_kernel_k1m2n2(%%ymm12,%%ymm13,%%r15,%%r12,1) +#define KERNEL_t_k1m2n10 KERNEL_h_k1m2n10 "addq $32,%%r15;" +#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_kernel_k1m2n2(%%ymm14,%%ymm15,%%r15,%%r12,2) +#define KERNEL_t_k1m2n12 KERNEL_h_k1m2n12 "addq $32,%%r15;" +#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 //not to do conjugation on a_block + #define unit_save_m2n1(alp_r,alp_i,c1,...) \ + "vpermilpd $5,"#c1",%%ymm3; vfmaddsub213pd ("#__VA_ARGS__"),"#alp_i",%%ymm3;"\ + "vfmaddsub213pd %%ymm3,"#alp_r","#c1";vmovupd "#c1",("#__VA_ARGS__");" +#else //do conjugation on a_block + #define unit_save_m2n1(alp_r,alp_i,c1,...) 
\ + "vpermilpd $5,"#c1",%%ymm3; vfmsubadd213pd ("#__VA_ARGS__"),"#alp_r","#c1";"\ + "vfmsubadd231pd %%ymm3,"#alp_i","#c1";vmovupd "#c1",("#__VA_ARGS__");" +#endif +#define SAVE_SETUP_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%5),%%ymm0; vbroadcastsd 8(%5),%%ymm1;" +#define SAVE_m2n1 SAVE_SETUP_m2 unit_save_m2n1(%%ymm0,%%ymm1,%%ymm4,%3) +#define SAVE_m2n2 SAVE_m2n1 unit_save_m2n1(%%ymm0,%%ymm1,%%ymm5,%3,%4,1) +#define unit_save_m2n2(c1,c2) \ + "leaq (%3,%4,2),%3;" unit_save_m2n1(%%ymm0,%%ymm1,c1,%3) unit_save_m2n1(%%ymm0,%%ymm1,c2,%3,%4,1) +#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(%%ymm6,%%ymm7) +#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(%%ymm8,%%ymm9) +#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(%%ymm10,%%ymm11) +#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(%%ymm12,%%ymm13) +#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(%%ymm14,%%ymm15) +#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define unit_init_m2n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";" +#define INIT_m2n2 unit_init_m2n2(%%ymm4,%%ymm5) +#define INIT_m2n4 INIT_m2n2 unit_init_m2n2(%%ymm6,%%ymm7) +#define INIT_m2n6 INIT_m2n4 unit_init_m2n2(%%ymm8,%%ymm9) +#define INIT_m2n8 INIT_m2n6 unit_init_m2n2(%%ymm10,%%ymm11) +#define INIT_m2n10 INIT_m2n8 unit_init_m2n2(%%ymm12,%%ymm13) +#define INIT_m2n12 INIT_m2n10 unit_init_m2n2(%%ymm14,%%ymm15) +#define COMPUTE_m2(ndim) \ + INIT_m2n##ndim\ + COMPUTE_INIT\ + #ndim"88220:\n\t"\ + "testq %6,%6; jz "#ndim"88221f;"\ + KERNEL_t_k1m2n##ndim\ + "decq %6; jmp "#ndim"88220b;"\ + #ndim"88221:\n\t"\ + SAVE_m2n##ndim + +/* m=1, ymm0-ymm3 and ymm10-ymm15 for temporary use, ymm4-ymm9 for accumulators */ +#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilpd($5,a0) + #define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vmovddup "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231pd "#ap",%%xmm2,"#c1";"\ + "vmovddup "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231pd "#a0",%%xmm2,"#c1";" + #define unit_kernel_k1m1n2(a0,ap,b_off_r,b_off_i,c1,...) \ + "vmovddup "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#ap",%%ymm2,"#c1";"\ + "vmovddup "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#a0",%%ymm2,"#c1";" +#else //conjg_a != conjg_b + #define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \ + "vmovddup "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231pd "#ap",%%xmm2,"#c1";"\ + "vmovddup "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231pd "#a0",%%xmm2,"#c1";" + #define unit_kernel_k1m1n2(a0,ap,b_off_r,b_off_i,c1,...) 
\ + "vmovddup "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#ap",%%ymm2,"#c1";"\ + "vmovddup "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#a0",%%ymm2,"#c1";" +#endif +#define KERNEL_h_k1m1n1 \ + "vmovupd (%0),%%xmm0; vpermilpd $5,%%xmm0,%%xmm1; addq $16,%0;"\ + unit_kernel_k1m1n1(%%xmm0,%%xmm1,0,8,%%xmm4,%1) +#define KERNEL_t_k1m1n1 KERNEL_h_k1m1n1 "addq $16,%1;" +#define KERNEL_h_k1m1n2 \ + "vbroadcastf128 (%0),%%ymm0; vpermilpd $5,%%ymm0,%%ymm1; addq $16,%0;"\ + unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm4,%1) +#define KERNEL_t_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;" +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm5,%1,%%r12,1) +#define KERNEL_t_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;" +#define KERNEL_t_k1m1n6 KERNEL_h_k1m1n4 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm6,%1,%%r12,2) "addq $32,%1;" +#define KERNEL_h_k1m1n8 KERNEL_t_k1m1n6 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm7,%%r15) +#define KERNEL_t_k1m1n8 KERNEL_h_k1m1n8 "addq $32,%%r15;" +#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm8,%%r15,%%r12,1) +#define KERNEL_t_k1m1n10 KERNEL_h_k1m1n10 "addq $32,%%r15;" +#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm9,%%r15,%%r12,2) +#define KERNEL_t_k1m1n12 KERNEL_h_k1m1n12 "addq $32,%%r15;" +#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 //not to do conjugation on a_block + #define unit_save_m1n1(alp_r,alp_i,c1,...) \ + "vpermilpd $5,"#c1",%%xmm3; vfmaddsub213pd ("#__VA_ARGS__"),"#alp_i",%%xmm3;"\ + "vfmaddsub213pd %%xmm3,"#alp_r","#c1";vmovupd "#c1",("#__VA_ARGS__");" + #define unit_save_m1n2(alp_r,alp_i,c1) \ + "vpermilpd $5,"#c1",%%ymm3; vmovupd (%3),%%xmm2; vinsertf128 $1,(%3,%4,1),%%ymm2,%%ymm2;"\ + "vfmaddsub213pd %%ymm2,"#alp_i",%%ymm3; vfmaddsub231pd "#c1","#alp_r",%%ymm3;"\ + "vmovupd %%xmm3,(%3); vextractf128 $1,%%ymm3,(%3,%4,1); leaq (%3,%4,2),%3;" +#else //do conjugation on a_block + #define unit_save_m1n1(alp_r,alp_i,c1,...) 
\ + "vpermilpd $5,"#c1",%%xmm3; vfmsubadd213pd ("#__VA_ARGS__"),"#alp_r","#c1";"\ + "vfmsubadd231pd %%xmm3,"#alp_i","#c1";vmovupd "#c1",("#__VA_ARGS__");" + #define unit_save_m1n2(alp_r,alp_i,c1) \ + "vpermilpd $5,"#c1",%%ymm3; vmovupd (%3),%%xmm2; vinsertf128 $1,(%3,%4,1),%%ymm2,%%ymm2;"\ + "vfmsubadd213pd %%ymm2,"#alp_r","#c1"; vfmsubadd213pd "#c1","#alp_i",%%ymm3;"\ + "vmovupd %%xmm3,(%3); vextractf128 $1,%%ymm3,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_SETUP_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%5),%%ymm0; vbroadcastsd 8(%5),%%ymm1;" +#define SAVE_m1n1 SAVE_SETUP_m1 unit_save_m1n1(%%xmm0,%%xmm1,%%xmm4,%3) +#define SAVE_m1n2 SAVE_SETUP_m1 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm4) +#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm5) +#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm6) +#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm7) +#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm8) +#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm9) +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm6,%%ymm6,%%ymm6;" +#define INIT_m1n8 INIT_m1n6 "vpxor %%ymm7,%%ymm7,%%ymm7;" +#define INIT_m1n10 INIT_m1n8 "vpxor %%ymm8,%%ymm8,%%ymm8;" +#define INIT_m1n12 INIT_m1n10 "vpxor %%ymm9,%%ymm9,%%ymm9;" +#define COMPUTE_m1(ndim) \ + INIT_m1n##ndim\ + COMPUTE_INIT\ + #ndim"88110:\n\t"\ + "testq %6,%6; jz "#ndim"88111f;"\ + KERNEL_t_k1m1n##ndim\ + "decq %6; jmp "#ndim"88110b;"\ + #ndim"88111:\n\t"\ + SAVE_m1n##ndim + +#define COMPUTE(ndim) {\ + b_pref = b_pointer + ndim * K * 2;\ + __asm__ __volatile__(\ + GENERAL_INIT\ + CONSTZMM_INIT\ + "cmpq $4,%7;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m4(ndim)\ + "subq $4,%7;cmpq $4,%7;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $2,%7;jb 33102"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%7;"\ + "33102"#ndim":\n\t"\ + "testq %7,%7;jz 33103"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33103"#ndim":\n\t"\ + GENERAL_RECOVER\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(c_store),"+r"(ldc_in_bytes),"+r"(constval),"+r"(K),"+r"(M),"+r"(b_pref)\ + ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ + "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ + "cc","memory");\ + a_pointer -= M * K * 2; b_pointer += ndim * K * 2; c_pointer += (LDC * ndim - M) * 2;\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2; double const_val[4] = {alphar, alphai, -1, 1}; + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + double *a_pointer = A,*b_pointer = B,*c_pointer = C,*c_store = C,*constval = const_val,*b_pref = B; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>9;n_count-=10) COMPUTE(10) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From bf73aa141b334ce2d870fe1b7ab340e0c7df5e8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Nov 2019 00:19:24 +0100 Subject: [PATCH 
767/935] Fix potential spurious failure from uninitialized variable --- ctest/c_cblat3.f | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f index 96f190352..74293ce53 100644 --- a/ctest/c_cblat3.f +++ b/ctest/c_cblat3.f @@ -1503,6 +1503,8 @@ C $ ' .' ) NC = 0 RESET = .TRUE. ERRMAX = RZERO + RALS = RONE + RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) From 351d12b94e2213b6d819fd1cb5c28f64f7deafc9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Nov 2019 00:20:36 +0100 Subject: [PATCH 768/935] Fix potential spurious failure from uninitialized variable --- ctest/c_zblat3.f | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f index 5df834b2e..cc109d651 100644 --- a/ctest/c_zblat3.f +++ b/ctest/c_zblat3.f @@ -1504,6 +1504,8 @@ C $ ' .' ) NC = 0 RESET = .TRUE. ERRMAX = RZERO + RALS = RONE + RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) From 6082e556cd990fc4d13e89d83db403b79d771e52 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Nov 2019 15:10:26 +0100 Subject: [PATCH 769/935] Use "generic" S/CGEMM unroll M on big-endian PPC970 as the respective PPC970 "altivec" kernels give wrong results when compiled for big endian --- param.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/param.h b/param.h index 1cf4137d6..9dc94c420 100644 --- a/param.h +++ b/param.h @@ -1990,11 +1990,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 3072 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define SGEMM_DEFAULT_UNROLL_M 4 +#else #define SGEMM_DEFAULT_UNROLL_M 16 +#endif #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 +#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define CGEMM_DEFAULT_UNROLL_M 2 +#else #define CGEMM_DEFAULT_UNROLL_M 8 +#endif #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 From b3ac6ee2227c1c62c6f0b93d8e9985423fabdc9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Nov 2019 15:19:39 +0100 Subject: [PATCH 770/935] Define alternate kernels for big-endian PPC970 The altivec versions of SGEMM and CGEMM fail most tests in LAPACK-TESTING when compiled for big endian, STRSM/CTRSM even cause segfaults. The rot kernels either fail the corresponding utest or lead to failures in LAPACK-TESTING.
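For reference, the byte-order test behind these overrides is the GCC-predefined macro pair already used in param.h above. A minimal stand-alone sketch, illustrative only and not part of the patch (the is_big_endian() helper and the demo main() are hypothetical):

#include <stdio.h>

/* Compile-time dispatch as in param.h: GCC predefines __BYTE_ORDER__ and
   __ORDER_BIG_ENDIAN__, so the altivec kernels can be excluded on
   big-endian builds without any runtime cost. */
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define KERNEL_VARIANT "generic (big-endian safe)"
#else
#define KERNEL_VARIANT "altivec"
#endif

/* Runtime cross-check: on a big-endian machine the most significant byte
   of an integer is stored first, so the first byte of 1 is 0. */
static int is_big_endian(void) {
  const unsigned int one = 1;
  return *(const unsigned char *)&one == 0;
}

int main(void) {
  printf("kernel variant selected at compile time: %s\n", KERNEL_VARIANT);
  printf("big-endian at runtime: %d\n", is_big_endian());
  return 0;
}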
--- kernel/power/KERNEL.PPC970 | 55 +++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index 7431a7788..de30977de 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,3 +1,14 @@ +ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else SGEMMKERNEL = gemm_kernel_altivec.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c @@ -7,6 +18,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + DGEMMKERNEL = gemm_kernel.S DGEMMINCOPY = DGEMMITCOPY = @@ -16,6 +29,18 @@ DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else CGEMMKERNEL = zgemm_kernel_altivec.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c @@ -25,6 +50,8 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + ZGEMMKERNEL = zgemm_kernel.S ZGEMMINCOPY = ZGEMMITCOPY = @@ -35,22 +62,30 @@ ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -#STRSMKERNEL_LN = trsm_kernel_LN.S -#STRSMKERNEL_LT = trsm_kernel_LT.S -#STRSMKERNEL_RN = trsm_kernel_LT.S -#STRSMKERNEL_RT = trsm_kernel_RT.S - DTRSMKERNEL_LN = trsm_kernel_LN.S DTRSMKERNEL_LT = trsm_kernel_LT.S DTRSMKERNEL_RN = trsm_kernel_LT.S DTRSMKERNEL_RT = trsm_kernel_RT.S -#CTRSMKERNEL_LN = ztrsm_kernel_LN.S -#CTRSMKERNEL_LT = ztrsm_kernel_LT.S -#CTRSMKERNEL_RN = ztrsm_kernel_LT.S -#CTRSMKERNEL_RT = ztrsm_kernel_RT.S - ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c +endif From 82b75f97e509aaef4bc5d95c4b54b29b77c50ede Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Nov 2019 19:22:04 +0100 Subject: [PATCH 771/935] Disable the old QCDOC qalloc by default and copy utility functions from memory.c 1. qalloc() appears to have been a special routine written for the PPC440-based QCDOC supercomputer(s) from around 2005, its source does not seem to be readily available. 
So switch the #if 1 in the code to rely on standard malloc() by default. 2. Utility functions like get_num_procs, get_num_threads that were added to the "normally" used memory.c in the meantime were still missing here. --- driver/others/memory_qalloc.c | 321 ++++++++++++++++++++++++++++++++-- 1 file changed, 311 insertions(+), 10 deletions(-) diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 17b7f5d60..6174d9b75 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -38,21 +38,29 @@ #include #include "common.h" +#ifdef OS_LINUX +#include +#include +#include +#include +#include +#include +#include +#endif -#ifndef SMP -#define blas_cpu_number 1 -#else - -int blas_cpu_number = 1; - -int blas_get_cpu_number(void){ +#ifdef OS_HAIKU +#include +#endif - return blas_cpu_number; -} +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) +#include +#include #endif + #define FIXED_PAGESIZE 4096 + void *sa = NULL; void *sb = NULL; static double static_buffer[BUFFER_SIZE/sizeof(double)]; @@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)]; void *blas_memory_alloc(int numproc){ if (sa == NULL){ -#if 1 +#if 0 sa = (void *)qalloc(QFAST, BUFFER_SIZE); #else sa = (void *)malloc(BUFFER_SIZE); @@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){ return; } + + +extern void openblas_warning(int verbose, const char * msg); + +#ifndef SMP + +#define blas_cpu_number 1 +#define blas_num_threads 1 + +/* Dummy Function */ +int goto_get_num_procs (void) { return 1;}; +void goto_set_num_threads(int num_threads) {}; + +#else + +#if defined(OS_LINUX) || defined(OS_SUNOS) +#ifndef NO_AFFINITY +int get_num_procs(void); +#else +int get_num_procs(void) { + + static int nums = 0; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif + + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); +#if !defined(OS_LINUX) + return nums; +#endif + +/* +#if !defined(__GLIBC_PREREQ) + return nums; +#else + #if !__GLIBC_PREREQ(2, 3) + return nums; + #endif + + #if !__GLIBC_PREREQ(2, 7) + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); + if (ret!=0) return nums; + n=0; + #if !__GLIBC_PREREQ(2, 6) + for (i=0;i= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } + #endif +#endif +*/ + return 1; +} +#endif +#endif + +#ifdef OS_ANDROID +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_WINDOWS + +int get_num_procs(void) { + + static int nums = 0; + + if (nums == 0) { + + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + + nums = 
sysinfo.dwNumberOfProcessors; + } + + return nums; +} + +#endif + +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) + +int get_num_procs(void) { + + static int nums = 0; + + int m[2]; + size_t len; + + if (nums == 0) { + m[0] = CTL_HW; + m[1] = HW_NCPU; + len = sizeof(int); + sysctl(m, 2, &nums, &len, NULL, 0); + } + + return nums; +} + +#endif + +#if defined(OS_DARWIN) +int get_num_procs(void) { + static int nums = 0; + size_t len; + if (nums == 0){ + len = sizeof(int); + sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0); + } + return nums; +} +/* +void set_stack_limit(int limitMB){ + int result=0; + struct rlimit rl; + rlim_t StackSize; + + StackSize=limitMB*1024*1024; + result=getrlimit(RLIMIT_STACK, &rl); + if(result==0){ + if(rl.rlim_cur < StackSize){ + rl.rlim_cur=StackSize; + result=setrlimit(RLIMIT_STACK, &rl); + if(result !=0){ + fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result); + } + } + } +} +*/ +#endif + + +/* +OpenBLAS uses the numbers of CPU cores in multithreading. +It can be set by openblas_set_num_threads(int num_threads); +*/ +int blas_cpu_number = 0; +/* +The numbers of threads in the thread pool. +This value is equal or large than blas_cpu_number. This means some threads are sleep. +*/ +int blas_num_threads = 0; + +int goto_get_num_procs (void) { + return blas_cpu_number; +} + +void openblas_fork_handler() +{ + // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is + // built with "make USE_OPENMP=0". + // Hanging can still happen when OpenBLAS is built against the libgomp + // implementation of OpenMP. The problem is tracked at: + // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 + // In the mean time build with USE_OPENMP=0 or link against another + // implementation of OpenMP. +#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) + int err; + err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); + if(err != 0) + openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. 
You may meet hang after fork.\n"); +#endif +} + +extern int openblas_num_threads_env(); +extern int openblas_goto_num_threads_env(); +extern int openblas_omp_num_threads_env(); + +int blas_get_cpu_number(void){ +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) + int max_num; +#endif + int blas_goto_num = 0; + int blas_omp_num = 0; + + if (blas_num_threads) return blas_num_threads; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) + max_num = get_num_procs(); +#endif + + // blas_goto_num = 0; +#ifndef USE_OPENMP + blas_goto_num=openblas_num_threads_env(); + if (blas_goto_num < 0) blas_goto_num = 0; + + if (blas_goto_num == 0) { + blas_goto_num=openblas_goto_num_threads_env(); + if (blas_goto_num < 0) blas_goto_num = 0; + } + +#endif + + // blas_omp_num = 0; + blas_omp_num=openblas_omp_num_threads_env(); + if (blas_omp_num < 0) blas_omp_num = 0; + + if (blas_goto_num > 0) blas_num_threads = blas_goto_num; + else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; + else blas_num_threads = MAX_CPU_NUMBER; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) + if (blas_num_threads > max_num) blas_num_threads = max_num; +#endif + + if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; + +#ifdef DEBUG + printf( "Adjusted number of threads : %3d\n", blas_num_threads); +#endif + + blas_cpu_number = blas_num_threads; + + return blas_num_threads; +} +#endif + + +int openblas_get_num_procs(void) { +#ifndef SMP + return 1; +#else + return get_num_procs(); +#endif +} + +int openblas_get_num_threads(void) { +#ifndef SMP + return 1; +#else + // init blas_cpu_number if needed + blas_get_cpu_number(); + return blas_cpu_number; +#endif +} From 0c07c356c1fd402d6466a79742df9cdbd3f5d62a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Nov 2019 19:25:08 +0100 Subject: [PATCH 772/935] Define alternate kernels for big-endian PPC440 --- kernel/power/KERNEL.PPC440 | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 988a4b701..a0696b548 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c +else CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) CDOTKERNEL = zdot_ppc440.S ZDOTKERNEL = zdot_ppc440.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif ISAMAXKERNEL = iamax_ppc440.S IDAMAXKERNEL = iamax_ppc440.S @@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S +else +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c +endif SSCALKERNEL = scal_ppc440.S DSCALKERNEL = scal_ppc440.S @@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S 
ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S + +ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c +endif + From eba0aeb7cde7f59bca1e631512128f3e380588fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Nov 2019 22:58:32 +0100 Subject: [PATCH 773/935] Fix compilation for big-endian POWER8 --- kernel/power/caxpy_power8.S | 5 ++++- kernel/power/icamin_power8.S | 5 ++++- kernel/power/idamax.c | 6 +++--- kernel/power/idamin.c | 3 +-- kernel/power/isamax_power8.S | 5 ++++- kernel/power/isamin_power8.S | 5 ++++- kernel/power/izamin.c | 4 ++-- 7 files changed, 22 insertions(+), 11 deletions(-) diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S index 0ce61ca3b..b5f841d2e 100644 --- a/kernel/power/caxpy_power8.S +++ b/kernel/power/caxpy_power8.S @@ -12,11 +12,12 @@ PROLOGUE -caxpy_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#if _CALL_ELF ==2 .localentry caxpy_k,.-caxpy_k +#endif mr. 7,3 ble 0,.L33 cmpdi 7,9,1 @@ -515,7 +516,9 @@ caxpy_k: b .L13 .long 0 .byte 0,0,0,0,0,4,0,0 +#if _CALL_ELF ==2 .size caxpy_k,.-caxpy_k +#endif .section .rodata .align 4 .set .LANCHOR0,. + 0 diff --git a/kernel/power/icamin_power8.S b/kernel/power/icamin_power8.S index e3d66798e..f2993e83e 100644 --- a/kernel/power/icamin_power8.S +++ b/kernel/power/icamin_power8.S @@ -11,11 +11,12 @@ PROLOGUE -icamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#if _CALL_ELF ==2 .localentry icamin_k,.-icamin_k +#endif mr. 9,3 ble 0,.L25 cmpdi 7,5,0 @@ -388,7 +389,9 @@ icamin_k: b .L21 .long 0 .byte 0,0,0,0,0,1,0,0 +#if _CALL_ELF ==2 .size icamin_k,.-icamin_k +#endif .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC2: diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 337fa54f8..95aa592c7 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -324,15 +324,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { -#if defined(_CALL_ELF) && (_CALL_ELF == 2) BLASLONG n1 = n & -32; - if (n1 > 0) { +#if defined(_CALL_ELF) && (_CALL_ELF == 2) + if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); i = n1; } -#endif +#endif while (i < n) { if (ABS(x[i]) > maxf) { max = i; diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 85dd49ac1..323f9987e 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -328,13 +328,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n1 > 0) { min = diamin_kernel_32(n1, x, &minf); i = n1; } #endif - while (i < n) { if (ABS(x[i]) < minf) { min = i; diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S index c8fcaecc3..fa5433333 100644 --- a/kernel/power/isamax_power8.S +++ b/kernel/power/isamax_power8.S @@ -12,11 +12,12 @@ PROLOGUE -isamax_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#if _CALL_ELF ==2 .localentry isamax_k,.-isamax_k +#endif mr. 
11,3 ble 0,.L36 cmpdi 7,5,0 @@ -397,7 +398,9 @@ isamax_k: b .L61 .long 0 .byte 0,0,0,0,0,1,0,0 +#if _CALL_ELF ==2 .size isamax_k,.-isamax_k +#endif .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC2: diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S index 3873e879b..c9b6acb85 100644 --- a/kernel/power/isamin_power8.S +++ b/kernel/power/isamin_power8.S @@ -11,11 +11,12 @@ PROLOGUE -isamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#if _CALL_ELF ==2 .localentry isamin_k,.-isamin_k +#endif mr. 11,3 ble 0,.L36 cmpdi 7,5,0 @@ -380,7 +381,9 @@ isamin_k: b .L35 .long 0 .byte 0,0,0,0,0,1,0,0 +#if _CALL_ELF ==2 .size isamin_k,.-isamin_k +#endif .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC2: diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 8da2189c6..06a5537d8 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -316,14 +316,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) minf = CABS1(x,0); //index will not be incremented #if defined(_CALL_ELF) && (_CALL_ELF == 2) - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; if (n1 > 0) { min = ziamin_kernel_16_TUNED(n1, x, &minf); i = n1; ix = n1 << 1; } -#endif +#endif while(i < n) { From cad0d150db22663ff60feff0b6764e7a97235bd0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Nov 2019 23:12:10 +0100 Subject: [PATCH 774/935] Define alternate kernels for big-endian POWER8 --- kernel/power/KERNEL.POWER8 | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c08f3fb00..fb9452a35 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,14 +89,30 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) ISAMAXKERNEL = isamax_power8.S +else +ISAMAXKERNEL = isamax.c +endif IDAMAXKERNEL = idamax.c +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) ICAMAXKERNEL = icamax_power8.S +else +ICAMAXKERNEL = icamax.c +endif IZAMAXKERNEL = izamax.c # +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) ISAMINKERNEL = isamin_power8.S +else +ISAMINKERNEL = isamin.c +endif IDAMINKERNEL = idamin.c +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) ICAMINKERNEL = icamin_power8.S +else +ICAMINKERNEL = icamin.c +endif IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -112,7 +128,11 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) CAXPYKERNEL = caxpy_power8.S +else +CAXPYKERNEL = caxpy.c +endif ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c From 0caf1434c928d39373499ffc02abe645945485d8 Mon Sep 17 00:00:00 2001 From: "Wang, Long" Date: Wed, 20 Nov 2019 11:50:37 +0800 Subject: [PATCH 775/935] Fix the integer overflow issue for large matrix size For large matrices, e.g. M=N=K with M>1290, int mnk=M*N*K will overflow. This will lead to wrong branching to single-threading. The performance is degraded significantly.
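To make the arithmetic concrete: 1290^3 = 2146689000 still fits in a signed 32-bit int (INT_MAX = 2147483647), while 1291^3 = 2151685171 does not. A minimal stand-alone sketch of the truncation, illustrative only and not part of the patch (mnk is modeled with unsigned long long, the 64-bit type the follow-up Windows-compatibility commits below settle on):

#include <stdio.h>
#include <stdint.h>

int main(void) {
  int64_t M = 1291, N = 1291, K = 1291;      /* BLASLONG is 64-bit here */

  int64_t product = M * N * K;               /* 2151685171, exceeds INT_MAX */
  int mnk_old = (int)product;                /* old code: truncated to 32-bit int,
                                                typically wraps to -2143282125 */
  unsigned long long mnk_new = (unsigned long long)product; /* patched type */

  /* The size check in sgemm_kernel_direct_performant() then misfires: */
  printf("old: mnk >= 28*512*512 ? %d\n", mnk_old >= 28 * 512 * 512); /* 0, wrong */
  printf("new: mnk >= 28*512*512 ? %d\n", mnk_new >= 28 * 512 * 512); /* 1, right */
  return 0;
}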
Signed-off-by: Wang, Long --- kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 2 +- kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 3246e681f..31d82e3bf 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - int mnk = M * N * K; + unsigned long mnk = M * N * K; /* large matrixes -> not performant */ if (mnk >= 28 * 512 * 512) return 0; diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index 5d491237b..95963c0ac 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - int mnk = M * N * K; + unsigned long mnk = M * N * K; /* large matrixes -> not performant */ if (mnk >= 28 * 512 * 512) return 0; From 1f6071590d5b4fa3b52aa1456a9648ae354b2c7a Mon Sep 17 00:00:00 2001 From: Jehan Date: Wed, 20 Nov 2019 12:21:35 +0100 Subject: [PATCH 776/935] Fix usage of TerminateThread() causing critical section corruption. This patch was submitted to the GIMP project by a publisher wishing to keep confidentiality (hence anonymously). I just pass along the patch. Here is the patch explanation that came with it: First they remind us what Microsoft documentation says about TerminateThread: > TerminateThread is a dangerous function that should only be used in > the most extreme cases. You should call TerminateThread only if you > know exactly what the target thread is doing, and you control all of > the code that the target thread could possibly be running at the time > of the termination. (https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-terminatethread) Then they say that a 5-millisecond time-out might not be long enough for the thread to exit gracefully. They propose to set it to a much higher value (for instance here 5 seconds). And finally you should always check the return value of WaitForSingleObject(). In particular you want to run TerminateThread() only if WaitForSingleObject() failed, not in the success case.
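Condensed to a single thread handle, the corrected shutdown sequence from the hunk below looks like this (a sketch; shutdown_one_thread is an illustrative wrapper, not a function in the source):

#include <windows.h>

static void shutdown_one_thread(HANDLE thread)
{
  /* Give the worker up to 5 seconds to exit gracefully instead of 5 ms. */
  DWORD wait_thread_value = WaitForSingleObject(thread, 5000);

#ifndef OS_WINDOWSSTORE
  /* TerminateThread is only available with WINAPI_DESKTOP and
     WINAPI_SYSTEM, not WINAPI_APP in UWP; it is used only as a last
     resort, when the wait did not succeed. */
  if (WAIT_OBJECT_0 != wait_thread_value) {
    TerminateThread(thread, 0);
  }
#endif

  CloseHandle(thread);
}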
---
 driver/others/blas_server_win32.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index bace54a23..e27725baf 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){ for(i = 0; i < blas_num_threads - 1; i++){ // Could also just use WaitForMultipleObjects - WaitForSingleObject(blas_threads[i], 5); //INFINITE); + DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000); + #ifndef OS_WINDOWSSTORE -// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP - TerminateThread(blas_threads[i],0); + // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP + if (WAIT_OBJECT_0 != wait_thread_value) { + TerminateThread(blas_threads[i],0); + } #endif + CloseHandle(blas_threads[i]); }
From 1191db1a49b237e7c636616cb51ca0879d01c128 Mon Sep 17 00:00:00 2001
From: "Wang, Long"
Date: Wed, 20 Nov 2019 21:30:16 +0800
Subject: [PATCH 777/935] For the sake of Windows compatibility, use "unsigned long long" to ensure 64-bit length

On 64-bit Windows (LLP64), a plain long is still only 32 bits wide, so
the previous "unsigned long" did not give a 64-bit product there.

Signed-off-by: Wang, Long
---
 kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 4 ++-- kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
index 31d82e3bf..4177ae2dc 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
@@ -762,7 +762,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc) { - unsigned long M = m, N = n, K = k; + unsigned long long M = m, N = n, K = k; if (M == 0) return 0; if (N == 0)
@@ -1639,4 +1639,4 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict STORE_SCALAR(0, 0); } } -} \ No newline at end of file +}
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
index 95963c0ac..ee3417505 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - unsigned long mnk = M * N * K; + unsigned long long mnk = M * N * K; /* large matrixes -> not performant */ if (mnk >= 28 * 512 * 512) return 0;
From bfb5fbdb4d6660d539c5f1cf42c44aadb446b3e6 Mon Sep 17 00:00:00 2001
From: "Wang, Long"
Date: Thu, 21 Nov 2019 10:19:40 +0800
Subject: [PATCH 778/935] Revised Windows-compatibility fix for #2313

Signed-off-by: Wang, Long
---
 kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
index 4177ae2dc..76b82e65b 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
@@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - unsigned long mnk = M * N * K; + unsigned long long mnk = M * N * K; /* large matrixes -> not
performant */ if (mnk >= 28 * 512 * 512) return 0; From 883c39773a178048be81dac9f9110dd602e562f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dan=20Hor=C3=A1k?= Date: Thu, 21 Nov 2019 12:49:54 +0100 Subject: [PATCH 779/935] zarch: treat z15 as z14 instead of generic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Dan Horák --- cpuid_zarch.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 896ed94f5..872d846e1 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -30,17 +30,20 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 #define CPU_Z14 2 +#define CPU_Z15 3 static char *cpuname[] = { "ZARCH_GENERIC", "Z13", - "Z14" + "Z14", + "Z15" }; static char *cpuname_lower[] = { "zarch_generic", "z13", - "z14" + "z14", + "z15" }; int detect(void) @@ -66,6 +69,7 @@ int detect(void) if (strstr(p, "2965")) return CPU_Z13; if (strstr(p, "3906")) return CPU_Z14; if (strstr(p, "3907")) return CPU_Z14; + if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 return CPU_GENERIC; } From d117dfd5059c4c1e21a89f23412ca05bb1536dab Mon Sep 17 00:00:00 2001 From: Andreas Arnez Date: Fri, 20 Sep 2019 18:32:47 +0200 Subject: [PATCH 780/935] Change bad usage of "asum" to "sum" in ZARCH versions of ?sum The ZARCH implementations of ?sum contain a cut & paste-error: An inline assembly argument is named "sum", but the assembly references "asum" instead. The mismatch causes a build error. This is fixed. --- kernel/zarch/csum.c | 2 +- kernel/zarch/dsum.c | 2 +- kernel/zarch/ssum.c | 2 +- kernel/zarch/zsum.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c index c0b8c6371..e9413da8e 100644 --- a/kernel/zarch/csum.c +++ b/kernel/zarch/csum.c @@ -88,7 +88,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vrepf %%v25,%%v24,2\n\t" "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" + "vstef %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", diff --git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c index 178bc3462..8d44873c0 100644 --- a/kernel/zarch/dsum.c +++ b/kernel/zarch/dsum.c @@ -86,7 +86,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v31\n\t" "vrepg %%v25,%%v24,1\n\t" "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" + "vsteg %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c index a433ab592..3f3f46a85 100644 --- a/kernel/zarch/ssum.c +++ b/kernel/zarch/ssum.c @@ -89,7 +89,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vrepf %%v25,%%v24,2\n\t" "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" + "vstef %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c index 7cfc1f17f..e0f978d87 100644 --- a/kernel/zarch/zsum.c +++ b/kernel/zarch/zsum.c @@ -87,7 +87,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v31\n\t" "vrepg %%v25,%%v24,1\n\t" "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" + "vsteg %%v24,%[sum],0" : [sum] "=Q"(sum),[n] 
"+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", From 04226f1e97dd30b2e757893d230a5b1d67017b0d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 21 Nov 2019 18:14:29 +0100 Subject: [PATCH 781/935] Add the cpuid of the business/rackmount version of z15 as well --- cpuid_zarch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 872d846e1..df3b7898f 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -70,6 +70,7 @@ int detect(void) if (strstr(p, "3906")) return CPU_Z14; if (strstr(p, "3907")) return CPU_Z14; if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 + if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 return CPU_GENERIC; } From f3065a0eedb4d0e6a48d4009cbc447f127fbb5b8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Nov 2019 19:54:56 +0100 Subject: [PATCH 782/935] Fix race conditions in multithreaded GEMM3M by adding barriers (and a mutex lock for the non-OpenMP case) like it was already done for GEMM in level3_thread.c some time ago --- driver/level3/level3_gemm3m_thread.c | 53 ++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 4903aa5bd..21d431b60 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -408,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); @@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - } + WMB; + } current = mypos; @@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); /* thread has to wait */ - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); @@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + WMB; } } } while (current != mypos); @@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (is + min_i >= m_to) { /* Thread doesn't need this buffer any more */ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + WMB; } } @@ -541,7 +544,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); @@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); /* thread has to wait */ - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; 
STOP_RPCC(waiting2); @@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + WMB; } } } while (current != mypos); @@ -677,7 +681,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); @@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); /* thread has to wait */ - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); @@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; - } + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; +} } } while (current != mypos); @@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif if (is + min_i >= m_to) { /* Thread doesn't need this buffer any more */ - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; } } @@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;}; } } @@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ +#ifndef USE_OPENMP +#ifndef OS_WINDOWS +static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; +#else +CRITICAL_SECTION level3_lock; +InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + blas_arg_t newarg; blas_queue_t queue[MAX_CPU_NUMBER]; @@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; #endif +#ifndef USE_OPENMP +#ifndef OS_WINDOWS +pthread_mutex_lock(&level3_lock); +#else +EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + newarg.m = args -> m; newarg.n = args -> n; newarg.k = args -> k; @@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG free(job); #endif +#ifndef USE_OPENMP +#ifndef OS_WINDOWS + pthread_mutex_unlock(&level3_lock); +#else + LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + return 0; } From f95989cbc1d0c8dedbded1183103569cd64584fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Nov 2019 22:38:07 +0100 Subject: [PATCH 783/935] Fix AVX512 capability test (always returning zero) from #2322 --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index f1cd3c6e6..a4ff0e086 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -329,7 +329,7 @@ int support_avx512(){ if 
(!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 1){ + if((ebx & (1<<7)) == 0){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ From eb1e9c8c928f710170532ece33c4f183aea87ea2 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 26 Nov 2019 14:12:20 +0800 Subject: [PATCH 784/935] some optimizations --- kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 160 +++++++++----------- 1 file changed, 75 insertions(+), 85 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c index 72878acfd..51b0b94fa 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -2,7 +2,8 @@ #include #include -//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. +//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. + /* row-major c_block */ #define INNER_KERNEL_k1m1n8 \ "prefetcht0 384(%1);"\ @@ -13,18 +14,6 @@ INNER_KERNEL_k1m1n8\ "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" -#define INNER_KERNEL_k1m4n8 \ - INNER_KERNEL_k1m2n8\ - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\ - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;" - -#define INNER_KERNEL_k1m8n8 \ - INNER_KERNEL_k1m4n8\ - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\ - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\ - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\ - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" - #define INNER_KERNEL_k1m1n16 \ "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ @@ -34,18 +23,6 @@ INNER_KERNEL_k1m1n16\ "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" -#define INNER_KERNEL_k1m4n16 \ - INNER_KERNEL_k1m2n16\ - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\ - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;" - -#define INNER_KERNEL_k1m8n16 \ - INNER_KERNEL_k1m4n16\ - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\ - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\ - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\ - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" - #define INNER_KERNEL_k1m1n24 \ "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ @@ -55,18 +32,48 @@ INNER_KERNEL_k1m1n24\ "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" +/* row-major z-partition c_block */ +#define INNER_KERNEL_k1m4n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ + "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ + "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,2),%%zmm6; 
vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ + "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" + #define INNER_KERNEL_k1m4n24 \ - INNER_KERNEL_k1m2n24\ - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\ - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;" + INNER_KERNEL_k1m4n16\ + "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ + "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" -#define INNER_KERNEL_k1m8n24 \ - INNER_KERNEL_k1m4n24\ - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\ - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\ - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\ - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;" +#define INNER_KERNEL_k1m8n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ + "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ + "prefetcht0 128(%1);"\ + "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ + "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m8n8\ + "prefetcht0 128(%1,%%r12,2);"\ + "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ + "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m8n16\ + "prefetcht0 128(%1,%%r12,4);"\ + "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ + "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" + +/* micro kernels */ #define INNER_KERNELm1(nn) \ "cmpq $1,%2;jb "#nn"3f;"\ #nn"4:\n\t"\ @@ -84,7 +91,7 @@ #define INNER_KERNELm4(nn) \ "cmpq $1,%2;jb "#nn"00f;"\ #nn"01:\n\t"\ - INNER_KERNEL_k1m4n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ #nn"00:\n\t" @@ -92,18 +99,18 @@ #define INNER_KERNELm8(nn) \ "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\ #nn"008:\n\t"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + 
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ "prefetcht1 (%11); addq $16,%11;"\ "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\ "movq %3,%10;"\ #nn"001:\n\t"\ "cmpq $1,%2;jb "#nn"000f;"\ "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ "decq %2;jmp "#nn"001b;"\ ""#nn"000:\n\t" @@ -207,24 +214,19 @@ INNER_STORE_m1n8(%%zmm13,8) #define INNER_TRANS_4x8(c1,c2,c3,c4) \ - "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ - "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ - "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\ - "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ - "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ - "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" + "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ + "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ + "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ + +#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ + "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ + "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ + "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ + "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - INNER_TRANS_4x8(c1,c2,c3,c4)\ - INNER_TRANS_4x8(c5,c6,c7,c8)\ - "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\ - "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\ - "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\ - "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\ - "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\ - "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ - "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ - "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" + INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) //%7 for k01(input) only when m=4 #define INNER_STORE_4x8(c1,c2,c3,c4) \ @@ -250,20 +252,14 @@ INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) #define INNER_SAVE_m4n16 \ - "movq %3,%10;"\ - INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ - INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ - INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ - INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) + INNER_SAVE_m4n8\ + INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) #define INNER_SAVE_m4n24 \ - "movq %3,%10;"\ - INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ - INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ - INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ - INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ - INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ - INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) + INNER_SAVE_m4n16\ + INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ + INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) #define 
INNER_SAVE_m8n8 \ "movq %3,%10;"\ @@ -271,20 +267,14 @@ INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) #define INNER_SAVE_m8n16 \ - "movq %3,%10;"\ - INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ - INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ - INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ - INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) + INNER_SAVE_m8n8\ + INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ + INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) #define INNER_SAVE_m8n24 \ - "movq %3,%10;"\ - INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ - INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ - INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ - INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ - INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ - INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) + INNER_SAVE_m8n16\ + INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ + INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) #define COMPUTE_n8 {\ b_pref = packed_b_pointer + 8 * K;\ @@ -327,7 +317,7 @@ "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n16 {\ @@ -372,7 +362,7 @@ "leaq (%1,%%r12,4),%1;"\ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } @@ -417,9 +407,9 @@ "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ - "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ + 
"zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 From cf2a8e410cc095b40d3b357e74a5f77af83ce602 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 26 Nov 2019 21:55:04 -0700 Subject: [PATCH 785/935] Fix SEGV in cdot_power9 We were corrupting r2 because the local entry wasn't being setup correctly. --- kernel/power/cdot_power9.S | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/power/cdot_power9.S b/kernel/power/cdot_power9.S index 9ec7cdd85..6ca7a02a5 100644 --- a/kernel/power/cdot_power9.S +++ b/kernel/power/cdot_power9.S @@ -13,10 +13,7 @@ cdot_k: .LCF0: -0: addis 2,12,.TOC.-.LCF0@ha - addi 2,2,.TOC.-.LCF0@l - .localentry cdot_k,.-cdot_k - mr. 9,3 +0: mr. 9,3 ble 0,.L10 cmpdi 7,5,1 beq 7,.L18 From 6bc487de356f3b40412b0bca3ad950d4b5da38b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Nov 2019 15:10:57 +0100 Subject: [PATCH 786/935] Cleanup IOS build and disable FORTRAN on 32bit and ios builds for now Travis recently appears unable to find a matching homebrew package for 32bit gfortran, and the IOS crossbuild suffered from excessive output due to the known problem with "ASMNAME redefined" warnings when CFLAGS is set in the environment --- .travis.yml | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6016ec1fe..fb6006474 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,10 +4,11 @@ dist: precise sudo: true language: c -matrix: +jobs: include: - &test-ubuntu os: linux + stage: test compiler: gcc addons: apt: @@ -17,7 +18,7 @@ matrix: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - set -e - - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -25,15 +26,6 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" - - <<: *test-ubuntu - os: linux-ppc64le - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" - env: - # for matrix annotation only - - TARGET_BOX=PPC64LE_LINUX - - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 @@ -67,6 +59,7 @@ matrix: - BTYPE="BINARY=32" - os: linux + stage: test compiler: gcc addons: apt: @@ -87,12 +80,13 @@ matrix: # that don't require sudo. 
- &test-alpine os: linux + stage: test dist: trusty sudo: true language: minimal before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ + && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } install: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' @@ -126,10 +120,11 @@ matrix: - <<: *test-alpine env: - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" + - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - &test-cmake os: linux + stage: test compiler: clang addons: apt: @@ -156,30 +151,17 @@ matrix: env: - CMAKE=1 - - &test-macos + - osx-gcc os: osx - osx_image: xcode10.1 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + stage: test + osx_image: xcode8 + before_script: *common-before - brew update - - brew install gcc@8 # for gfortran + - brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8" - - - <<: *test-macos - osx_image: xcode8.3 - env: - - BTYPE="BINARY=32 FC=gfortran-8" - - - <<: *test-macos - osx_image: xcode10.1 - env: - - COMMON_FLAGS="NUM_THREADS=32" - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk" - - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang" + - BTYPE="BINARY=64 INTERFACE64=1" # whitelist branches: From 83dae28ae25b05d3e607f4f1112c369ce2f2e653 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 28 Nov 2019 00:09:06 +0100 Subject: [PATCH 787/935] Revert "Cleanup Travis IOS xbuild and disable FORTRAN on 32bit and ios builds for now" --- .travis.yml | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/.travis.yml b/.travis.yml index fb6006474..6016ec1fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,11 +4,10 @@ dist: precise sudo: true language: c -jobs: +matrix: include: - &test-ubuntu os: linux - stage: test compiler: gcc addons: apt: @@ -18,7 +17,7 @@ jobs: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - set -e - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -26,6 +25,15 @@ jobs: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 @@ -59,7 +67,6 @@ jobs: - BTYPE="BINARY=32" - os: linux - stage: test compiler: gcc addons: apt: @@ -80,13 +87,12 @@ jobs: # that don't require sudo. 
- &test-alpine os: linux - stage: test dist: trusty sudo: true language: minimal before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ - && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } install: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' @@ -120,11 +126,10 @@ jobs: - <<: *test-alpine env: - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" + - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - &test-cmake os: linux - stage: test compiler: clang addons: apt: @@ -151,17 +156,30 @@ jobs: env: - CMAKE=1 - - osx-gcc + - &test-macos os: osx - stage: test - osx_image: xcode8 - before_script: *common-before + osx_image: xcode10.1 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - brew install gcc # for gfortran + - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="BINARY=64 INTERFACE64=1" + - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8" + + - <<: *test-macos + osx_image: xcode8.3 + env: + - BTYPE="BINARY=32 FC=gfortran-8" + + - <<: *test-macos + osx_image: xcode10.1 + env: + - COMMON_FLAGS="NUM_THREADS=32" + - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk" + - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang" # whitelist branches: From ae2a0995ccaefc20a3476a4b232b37d7f37ad794 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 28 Nov 2019 00:15:36 +0100 Subject: [PATCH 788/935] Cleanup IOS build and disable FORTRAN on 32bit and ios builds for now Travis recently appears unable to find a matching homebrew package for 32bit gfortran, and the IOS crossbuild suffered from excessive output due to the known problem with "ASMNAME redefined" warnings when CFLAGS is set in the environment --- .travis.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6016ec1fe..9e18412e8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -160,26 +160,25 @@ matrix: os: osx osx_image: xcode10.1 before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" - <<: *test-macos - osx_image: xcode8.3 + osx_image: xcode10.0 env: - - BTYPE="BINARY=32 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos osx_image: xcode10.1 env: - - COMMON_FLAGS="NUM_THREADS=32" - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot 
/Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk" - - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang" + - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" # whitelist branches: From 934e601e934f5cf930382dfd9d7e92b937d1d2ed Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 28 Nov 2019 19:56:35 +0800 Subject: [PATCH 789/935] Update dgemm_kernel_4x8_skylakex_2.c --- kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c index 51b0b94fa..90a4c2b1d 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -97,15 +97,17 @@ /* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ #define INNER_KERNELm8(nn) \ - "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\ + "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ #nn"008:\n\t"\ INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%11); addq $16,%11;"\ - "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%11); addq $32,%11;"\ + "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ "movq %3,%10;"\ #nn"001:\n\t"\ "cmpq $1,%2;jb "#nn"000f;"\ From e20709e976ba668141f07d47bf1152a6e948b729 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 28 Nov 2019 19:57:50 +0800 Subject: [PATCH 790/935] Update param.h --- param.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/param.h b/param.h index 9dc94c420..d39fc4a1d 100644 --- a/param.h +++ b/param.h @@ -1691,16 +1691,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 168 #endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 From 97762234f9517f1ae90fc97a4456cd0923c30319 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 Nov 2019 23:47:23 +0100 Subject: [PATCH 791/935] Add variable for gcc >=9 test used in KERNEL.POWER9 --- kernel/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index e81225075..9b468a6af 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,6 +5,11 @@ endif TOPDIR = .. 
include $(TOPDIR)/Makefile.system + +ifeq ($(C_COMPILER), GCC) +GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +endif + AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 From a9b62c03f852a38cc2171a652b93a673591c483b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 Nov 2019 23:49:50 +0100 Subject: [PATCH 792/935] Substitute precompiled gcc7 codes only when gcc is older than 9.x --- kernel/power/KERNEL.POWER9 | 392 ++++++++++++++++++++----------------- 1 file changed, 208 insertions(+), 184 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 2ed843fff..4bfa017e1 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -1,184 +1,208 @@ -#SGEMM_BETA = ../generic/gemm_beta.c -#DGEMM_BETA = ../generic/gemm_beta.c -#CGEMM_BETA = ../generic/zgemm_beta.c -#ZGEMM_BETA = ../generic/zgemm_beta.c - -STRMMKERNEL = sgemm_kernel_power9.S -DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = cgemm_kernel_power9.S -ZTRMMKERNEL = zgemm_kernel_power9.S - -SGEMMKERNEL = sgemm_kernel_power9.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c -SGEMMITCOPY = sgemm_tcopy_16_power8.S -SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_power9.S -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = dgemm_ncopy_4_power8.S -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_power9.S -CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8.c -CGEMMONCOPY = ../generic/zgemm_ncopy_4.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_power9.S -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = zgemm_tcopy_8_power8.S -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
-#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S -#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S - -#Pure C for other kernels -#SAMAXKERNEL = ../arm/amax.c -#DAMAXKERNEL = ../arm/amax.c -#CAMAXKERNEL = ../arm/zamax.c -#ZAMAXKERNEL = ../arm/zamax.c -# -#SAMINKERNEL = ../arm/amin.c -#DAMINKERNEL = ../arm/amin.c -#CAMINKERNEL = ../arm/zamin.c -#ZAMINKERNEL = ../arm/zamin.c -# -#SMAXKERNEL = ../arm/max.c -#DMAXKERNEL = ../arm/max.c -# -#SMINKERNEL = ../arm/min.c -#DMINKERNEL = ../arm/min.c -# -ISAMAXKERNEL = isamax_power9.S -IDAMAXKERNEL = idamax.c -ICAMAXKERNEL = icamax_power9.S -IZAMAXKERNEL = izamax.c -# -ISAMINKERNEL = isamin_power9.S -IDAMINKERNEL = idamin.c -ICAMINKERNEL = icamin_power9.S -IZAMINKERNEL = izamin.c -# -#ISMAXKERNEL = ../arm/imax.c -#IDMAXKERNEL = ../arm/imax.c -# -#ISMINKERNEL = ../arm/imin.c -#IDMINKERNEL = ../arm/imin.c -# -SASUMKERNEL = sasum.c -DASUMKERNEL = dasum.c -CASUMKERNEL = casum.c -ZASUMKERNEL = zasum.c -# -SAXPYKERNEL = saxpy.c -DAXPYKERNEL = daxpy.c -CAXPYKERNEL = caxpy_power9.S -ZAXPYKERNEL = zaxpy.c -# -SCOPYKERNEL = scopy.c -DCOPYKERNEL = dcopy.c -CCOPYKERNEL = ccopy.c -ZCOPYKERNEL = zcopy.c -# -SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c -DSDOTKERNEL = sdot.c -CDOTKERNEL = cdot_power9.S -ZDOTKERNEL = zdot.c -# -SNRM2KERNEL = ../arm/nrm2.c -DNRM2KERNEL = ../arm/nrm2.c -CNRM2KERNEL = ../arm/znrm2.c -ZNRM2KERNEL = ../arm/znrm2.c -# -SROTKERNEL = srot.c -DROTKERNEL = drot.c -CROTKERNEL = crot.c -ZROTKERNEL = zrot.c -# -SSCALKERNEL = sscal.c -DSCALKERNEL = dscal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c -# -SSWAPKERNEL = sswap.c -DSWAPKERNEL = dswap.c -CSWAPKERNEL = cswap.c -ZSWAPKERNEL = zswap.c -# - -SGEMVNKERNEL = sgemv_n.c -DGEMVNKERNEL = dgemv_n.c -CGEMVNKERNEL = cgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c -# -SGEMVTKERNEL = sgemv_t.c -DGEMVTKERNEL = dgemv_t.c -CGEMVTKERNEL = cgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c - - -#SSYMV_U_KERNEL = ../generic/symv_k.c -#SSYMV_L_KERNEL = ../generic/symv_k.c -#DSYMV_U_KERNEL = ../generic/symv_k.c -#DSYMV_L_KERNEL = ../generic/symv_k.c -#QSYMV_U_KERNEL = ../generic/symv_k.c -#QSYMV_L_KERNEL = ../generic/symv_k.c -#CSYMV_U_KERNEL = ../generic/zsymv_k.c -#CSYMV_L_KERNEL = ../generic/zsymv_k.c -#ZSYMV_U_KERNEL = ../generic/zsymv_k.c -#ZSYMV_L_KERNEL = ../generic/zsymv_k.c -#XSYMV_U_KERNEL = ../generic/zsymv_k.c -#XSYMV_L_KERNEL = ../generic/zsymv_k.c - -#ZHEMV_U_KERNEL = ../generic/zhemv_k.c -#ZHEMV_L_KERNEL = ../generic/zhemv_k.c - -LSAME_KERNEL = ../generic/lsame.c -SCABS_KERNEL = ../generic/cabs.c -DCABS_KERNEL = ../generic/cabs.c -QCABS_KERNEL = ../generic/cabs.c - -#Dump kernel -CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c -ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = sgemm_kernel_power9.S +DTRMMKERNEL = dgemm_kernel_power9.S +CTRMMKERNEL = cgemm_kernel_power9.S +ZTRMMKERNEL = zgemm_kernel_power9.S + +SGEMMKERNEL = sgemm_kernel_power9.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S 
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_power9.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_power9.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
+#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMAXKERNEL = isamax_power9.S +else +ISAMAXKERNEL = isamax.c +endif +IDAMAXKERNEL = idamax.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMAXKERNEL = icamax_power9.S +else +ICAMAXKERNEL = icamax.c +endif +IZAMAXKERNEL = izamax.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMINKERNEL = isamin_power9.S +else +ISAMINKERNEL = isamin.c +endif +IDAMINKERNEL = idamin.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMINKERNEL = icamin_power9.S +else +ICAMINKERNEL = icamin.c +endif +IZAMINKERNEL = izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c +# +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +ifneq ($(GCCVERSIONGTEQ9),1) +CAXPYKERNEL = caxpy_power9.S +else +CAXPYKERNEL = caxpy.c +endif +ZAXPYKERNEL = zaxpy.c +# +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c +# +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +DSDOTKERNEL = sdot.c +ifneq ($(GCCVERSIONGTEQ9),1) +CDOTKERNEL = cdot_power9.S +else +CDOTKERNEL = cdot.c +endif +ZDOTKERNEL = zdot.c +# +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c +# +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c +# +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c +# +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c +# + +SGEMVNKERNEL = sgemv_n.c +DGEMVNKERNEL = dgemv_n.c +CGEMVNKERNEL = cgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c +# +SGEMVTKERNEL = sgemv_t.c +DGEMVTKERNEL = dgemv_t.c +CGEMVTKERNEL = cgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c + + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c +#QSYMV_U_KERNEL = ../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c From 2181fb7047f87f66ae1584c8af4e66e766b31b53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 Nov 2019 23:54:15 +0100 Subject: [PATCH 793/935] Fix caxpy/caxpyc naming in localentry --- kernel/power/caxpy_power8.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S index b5f841d2e..294a1d24d 100644 --- a/kernel/power/caxpy_power8.S +++ b/kernel/power/caxpy_power8.S @@ -16,7 +16,11 @@ 0: addis 
2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l #if _CALL_ELF ==2 +#ifdef CONJ + .localentry caxpyc_k,.-caxpyc_k +#else .localentry caxpy_k,.-caxpy_k +#endif #endif mr. 7,3 ble 0,.L33 @@ -517,7 +521,11 @@ .long 0 .byte 0,0,0,0,0,4,0,0 #if _CALL_ELF ==2 +#ifdef CONJ + .size caxpyc_k,.-caxpyc_k +#else .size caxpy_k,.-caxpy_k +#endif #endif .section .rodata .align 4 From dedd822d1aeb2315e44e47e97167ae8a02c9c9ff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 Nov 2019 23:56:57 +0100 Subject: [PATCH 794/935] Fix caxpy/caxpyc naming in localentry --- kernel/power/caxpy_power9.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/power/caxpy_power9.S b/kernel/power/caxpy_power9.S index 48e6e5ba3..844cacd50 100644 --- a/kernel/power/caxpy_power9.S +++ b/kernel/power/caxpy_power9.S @@ -17,7 +17,11 @@ caxpy_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#ifdef CONJ + .localentry caxpyc_k,.-caxpyc_k +#else .localentry caxpy_k,.-caxpy_k +#endif mr. 7,3 ble 0,.L33 cmpdi 7,9,1 @@ -474,7 +478,11 @@ caxpy_k: b .L13 .long 0 .byte 0,0,0,0,0,1,0,0 +#ifdef CONJ + .size caxpyc_k,.-caxpyc_k +#else .size caxpy_k,.-caxpy_k +#endif .section .rodata .align 4 .set .LANCHOR0,. + 0 From b863b32ac5598e96b76d5783ae3a96c2b58e1712 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 1 Dec 2019 11:55:49 -0600 Subject: [PATCH 795/935] Workaround an ICE in clang 9.0.0 This bug is not there in 8.x nor in the 9.0 daily snapshot. --- kernel/x86_64/dsymv_L_microk_skylakex-2.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c index 8244dffa1..bdcd914fb 100644 --- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -33,6 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x4 1 +#if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 +#pragma clang optimize off +#endif + static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) { @@ -155,7 +159,12 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL temp2[1] += half_accum1[0]; temp2[2] += half_accum2[0]; temp2[3] += half_accum3[0]; -} +} + +#if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 +#pragma clang optimize on +#endif + #else #include "dsymv_L_microk_haswell-2.c" -#endif \ No newline at end of file +#endif From 715f4650d9874badfede90e4bd09451ac8ea1886 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Dec 2019 08:24:10 +0100 Subject: [PATCH 796/935] Delete stray copy of dynamic.c from PR 2228 --- dynamic.c | 897 ------------------------------------------------------ 1 file changed, 897 deletions(-) delete mode 100644 dynamic.c diff --git a/dynamic.c b/dynamic.c deleted file mode 100644 index aa2b87621..000000000 --- a/dynamic.c +++ /dev/null @@ -1,897 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. 
Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include "common.h" - -#ifdef _MSC_VER -#define strncasecmp _strnicmp -#define strcasecmp _stricmp -#endif - -#ifdef ARCH_X86 -#define EXTERN extern -#else -#define EXTERN -#endif - -#ifdef DYNAMIC_LIST -extern gotoblas_t gotoblas_PRESCOTT; - -#ifdef DYN_ATHLON -extern gotoblas_t gotoblas_ATHLON; -#else -#define gotoblas_ATHLON gotoblas_PRESCOTT -#endif -#ifdef DYN_KATMAI -extern gotoblas_t gotoblas_KATMAI; -#else -#define gotoblas_KATMAI gotoblas_PRESCOTT -#endif -#ifdef DYN_BANIAS -extern gotoblas_t gotoblas_BANIAS; -#else -#define gotoblas_BANIAS gotoblas_PRESCOTT -#endif -#ifdef DYN_COPPERMINE -extern gotoblas_t gotoblas_COPPERMINE; -#else -#define gotoblas_COPPERMINE gotoblas_PRESCOTT -#endif -#ifdef DYN_NORTHWOOD -extern gotoblas_t gotoblas_NORTHWOOD; -#else -#define gotoblas_NORTHWOOD gotoblas_PRESCOTT -#endif -#ifdef DYN_CORE2 -extern gotoblas_t gotoblas_CORE2; -#else -#define gotoblas_CORE2 gotoblas_PRESCOTT -#endif -#ifdef DYN_NEHALEM -extern gotoblas_t gotoblas_NEHALEM; -#else -#define gotoblas_NEHALEM gotoblas_PRESCOTT -#endif -#ifdef DYN_BARCELONA -extern gotoblas_t gotoblas_BARCELONA; -#elif defined(DYN_NEHALEM) -#define gotoblas_BARCELONA gotoblas_NEHALEM -#else -#define gotoblas_BARCELONA gotoblas_PRESCOTT -#endif -#ifdef DYN_ATOM -extern gotoblas_t gotoblas_ATOM; -elif defined(DYN_NEHALEM) -#define gotoblas_ATOM gotoblas_NEHALEM -#else -#define gotoblas_ATOM gotoblas_PRESCOTT -#endif -#ifdef DYN_NANO -extern gotoblas_t gotoblas_NANO; -#else -#define gotoblas_NANO gotoblas_PRESCOTT -#endif -#ifdef DYN_PENRYN -extern gotoblas_t gotoblas_PENRYN; -#else -#define gotoblas_PENRYN gotoblas_PRESCOTT -#endif -#ifdef DYN_DUNNINGTON -extern gotoblas_t gotoblas_DUNNINGTON; -#else -#define gotoblas_DUNNINGTON gotoblas_PRESCOTT -#endif -#ifdef DYN_OPTERON -extern gotoblas_t gotoblas_OPTERON; -#else -#define gotoblas_OPTERON gotoblas_PRESCOTT -#endif -#ifdef DYN_OPTERON_SSE3 -extern gotoblas_t gotoblas_OPTERON_SSE3; -#else 
-#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT -#endif -#ifdef DYN_BOBCAT -extern gotoblas_t gotoblas_BOBCAT; -#elif defined(DYN_NEHALEM) -#define gotoblas_BOBCAT gotoblas_NEHALEM -#else -#define gotoblas_BOBCAT gotoblas_PRESCOTT -#endif -#ifdef DYN_SANDYBRIDGE -extern gotoblas_t gotoblas_SANDYBRIDGE; -#elif defined(DYN_NEHALEM) -#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM -#else -#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT -#endif -#ifdef DYN_BULLDOZER -extern gotoblas_t gotoblas_BULLDOZER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_BULLDOZER gotoblas_NEHALEM -#else -#define gotoblas_BULLDOZER gotoblas_PRESCOTT -#endif -#ifdef DYN_PILEDRIVER -extern gotoblas_t gotoblas_PILEDRIVER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_PILEDRIVER gotoblas_NEHALEM -#else -#define gotoblas_PILEDRIVER gotoblas_PRESCOTT -#endif -#ifdef DYN_STEAMROLLER -extern gotoblas_t gotoblas_STEAMROLLER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_STEAMROLLER gotoblas_NEHALEM -#else -#define gotoblas_STEAMROLLER gotoblas_PRESCOTT -#endif -#ifdef DYN_EXCAVATOR -extern gotoblas_t gotoblas_EXCAVATOR; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_EXCAVATOR gotoblas_NEHALEM -#else -#define gotoblas_EXCAVATOR gotoblas_PRESCOTT -#endif -#ifdef DYN_HASWELL -extern gotoblas_t gotoblas_HASWELL; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_HASWELL gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_HASWELL gotoblas_NEHALEM -#else -#define gotoblas_HASWELL gotoblas_PRESCOTT -#endif -#ifdef DYN_ZEN -extern gotoblas_t gotoblas_ZEN; -#elif defined(DYN_HASWELL) -#define gotoblas_ZEN gotoblas_HASWELL -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_ZEN gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_ZEN gotoblas_NEHALEM -#else -#define gotoblas_ZEN gotoblas_PRESCOTT -#endif -#ifdef DYN_SKYLAKEX -extern gotoblas_t gotoblas_SKYLAKEX; -#elif defined(DYN_HASWELL) -#define gotoblas_SKYLAKEX gotoblas_HASWELL -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_SKYLAKEX gotoblas_NEHALEM -#else -#define gotoblas_SKYLAKEX gotoblas_PRESCOTT -#endif - - -#else // not DYNAMIC_LIST -EXTERN gotoblas_t gotoblas_KATMAI; -EXTERN gotoblas_t gotoblas_COPPERMINE; -EXTERN gotoblas_t gotoblas_NORTHWOOD; -EXTERN gotoblas_t gotoblas_BANIAS; -EXTERN gotoblas_t gotoblas_ATHLON; - -extern gotoblas_t gotoblas_PRESCOTT; -extern gotoblas_t gotoblas_CORE2; -extern gotoblas_t gotoblas_NEHALEM; -extern gotoblas_t gotoblas_BARCELONA; -#ifdef DYNAMIC_OLDER -extern gotoblas_t gotoblas_ATOM; -extern gotoblas_t gotoblas_NANO; -extern gotoblas_t gotoblas_PENRYN; -extern gotoblas_t gotoblas_DUNNINGTON; -extern gotoblas_t gotoblas_OPTERON; -extern gotoblas_t gotoblas_OPTERON_SSE3; -extern gotoblas_t gotoblas_BOBCAT; -#else -#define gotoblas_ATOM gotoblas_NEHALEM -#define gotoblas_NANO gotoblas_NEHALEM -#define gotoblas_PENRYN gotoblas_CORE2 -#define gotoblas_DUNNINGTON gotoblas_CORE2 -#define gotoblas_OPTERON gotoblas_CORE2 -#define gotoblas_OPTERON_SSE3 gotoblas_CORE2 -#define gotoblas_BOBCAT gotoblas_CORE2 -#endif - -#ifndef NO_AVX -extern gotoblas_t gotoblas_SANDYBRIDGE; -extern gotoblas_t gotoblas_BULLDOZER; -extern 
gotoblas_t gotoblas_PILEDRIVER; -extern gotoblas_t gotoblas_STEAMROLLER; -extern gotoblas_t gotoblas_EXCAVATOR; -#ifdef NO_AVX2 -#define gotoblas_HASWELL gotoblas_SANDYBRIDGE -#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE -#define gotoblas_ZEN gotoblas_SANDYBRIDGE -#else -extern gotoblas_t gotoblas_HASWELL; -extern gotoblas_t gotoblas_ZEN; -#ifndef NO_AVX512 -extern gotoblas_t gotoblas_SKYLAKEX; -#else -#define gotoblas_SKYLAKEX gotoblas_HASWELL -#endif -#endif -#else -//Use NEHALEM kernels for sandy bridge -#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM -#define gotoblas_HASWELL gotoblas_NEHALEM -#define gotoblas_SKYLAKEX gotoblas_NEHALEM -#define gotoblas_BULLDOZER gotoblas_BARCELONA -#define gotoblas_PILEDRIVER gotoblas_BARCELONA -#define gotoblas_STEAMROLLER gotoblas_BARCELONA -#define gotoblas_EXCAVATOR gotoblas_BARCELONA -#define gotoblas_ZEN gotoblas_BARCELONA -#endif - -#endif // DYNAMIC_LIST - -#define VENDOR_INTEL 1 -#define VENDOR_AMD 2 -#define VENDOR_CENTAUR 3 -#define VENDOR_HYGON 4 -#define VENDOR_UNKNOWN 99 - -#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) - -#ifndef NO_AVX -static inline void xgetbv(int op, int * eax, int * edx){ - //Use binary code for xgetbv - __asm__ __volatile__ - (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); -} -#endif - -int support_avx(){ -#ifndef NO_AVX - int eax, ebx, ecx, edx; - int ret=0; - - cpuid(1, &eax, &ebx, &ecx, &edx); - if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ - xgetbv(0, &eax, &edx); - if((eax & 6) == 6){ - ret=1; //OS support AVX - } - } - return ret; -#else - return 0; -#endif -} - -int support_avx2(){ -#ifndef NO_AVX2 - int eax, ebx, ecx=0, edx; - int ret=0; - - if (!support_avx()) - return 0; - cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 - return ret; -#else - return 0; -#endif -} - -int support_avx512(){ -#if !defined(NO_AVX) && !defined(NO_AVX512) - int eax, ebx, ecx, edx; - int ret=0; - - if (!support_avx()) - return 0; - cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 1){ - ret=0; //OS does not even support AVX2 - } - if((ebx & (1<<31)) != 0){ - xgetbv(0, &eax, &edx); - if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL - } - return ret; -#else - return 0; -#endif -} - -extern void openblas_warning(int verbose, const char * msg); -#define FALLBACK_VERBOSE 1 -#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" -#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" -#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" -#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. 
OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" - -static int get_vendor(void){ - int eax, ebx, ecx, edx; - - union - { - char vchar[16]; - int vint[4]; - } vendor; - - cpuid(0, &eax, &ebx, &ecx, &edx); - - *(&vendor.vint[0]) = ebx; - *(&vendor.vint[1]) = edx; - *(&vendor.vint[2]) = ecx; - - vendor.vchar[12] = '\0'; - - if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; - if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; - if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; - if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; - - if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; - - return VENDOR_UNKNOWN; -} - -static gotoblas_t *get_coretype(void){ - - int eax, ebx, ecx, edx; - int family, exfamily, model, vendor, exmodel; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - family = BITMASK(eax, 8, 0x0f); - exfamily = BITMASK(eax, 20, 0xff); - model = BITMASK(eax, 4, 0x0f); - exmodel = BITMASK(eax, 16, 0x0f); - - vendor = get_vendor(); - - if (vendor == VENDOR_INTEL){ - switch (family) { - case 0x6: - switch (exmodel) { - case 0: - if (model <= 0x7) return &gotoblas_KATMAI; - if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; - if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; - if (model == 14) return &gotoblas_BANIAS; - if (model == 15) return &gotoblas_CORE2; - return NULL; - - case 1: - if (model == 6) return &gotoblas_CORE2; - if (model == 7) return &gotoblas_PENRYN; - if (model == 13) return &gotoblas_DUNNINGTON; - if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; - if (model == 12) return &gotoblas_ATOM; - return NULL; - - case 2: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) - // Xeon (Clarkdale), 32nm - if (model == 5) return &gotoblas_NEHALEM; - - //Intel Xeon Processor 5600 (Westmere-EP) - //Xeon Processor E7 (Westmere-EX) - //Xeon E7540 - if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; - - //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - //Intel Core i7-3000 / Xeon E5 - if (model == 10 || model == 13) { - if(support_avx()) - return &gotoblas_SANDYBRIDGE; - else{ - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - return NULL; - case 3: - //Intel Sandy Bridge 22nm (Ivy Bridge?) - if (model == 10 || model == 14) { - if(support_avx()) - return &gotoblas_SANDYBRIDGE; - else{ - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Haswell - if (model == 12 || model == 15) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Broadwell - if (model == 13) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
- } - } - if (model == 7) return &gotoblas_ATOM; //Bay Trail - return NULL; - case 4: - //Intel Haswell - if (model == 5 || model == 6) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Broadwell - if (model == 7 || model == 15) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Skylake - if (model == 14) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Braswell / Avoton - if (model == 12 || model == 13) { - return &gotoblas_NEHALEM; - } - return NULL; - case 5: - //Intel Broadwell - if (model == 6) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - if (model == 5) { - // Intel Skylake X - if (support_avx512()) - return &gotoblas_SKYLAKEX; - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - //Intel Skylake - if (model == 14) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Phi Knights Landing - if (model == 7) { - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
- } - } - //Apollo Lake or Denverton - if (model == 12 || model == 15) { - return &gotoblas_NEHALEM; - } - return NULL; - case 6: - if (model == 6) { - // Cannon Lake - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - return NULL; - case 7: - if (model == 10) // Goldmont plus - return &gotoblas_NEHALEM; - if (model == 14) { - // Ice Lake - if (support_avx512()) - return &gotoblas_SKYLAKEX; - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - return NULL; - case 9: - case 8: - if (model == 14 ) { // Kaby Lake, Coffee Lake - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - return NULL; - } - case 0xf: - if (model <= 0x2) return &gotoblas_NORTHWOOD; - return &gotoblas_PRESCOTT; - } - } - - if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ - if (family <= 0xe) { - // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if ( (eax & 0xffff) >= 0x01) { - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) - return NULL; - } - else - return NULL; - - return &gotoblas_ATHLON; - } - if (family == 0xf){ - if ((exfamily == 0) || (exfamily == 2)) { - if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; - else return &gotoblas_OPTERON; - } else if (exfamily == 5) { - return &gotoblas_BOBCAT; - } else if (exfamily == 6) { - if(model == 1){ - //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series - if(support_avx()) - return &gotoblas_BULLDOZER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if(model == 2 || model == 3){ - //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 - if(support_avx()) - return &gotoblas_PILEDRIVER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if(model == 5){ - if(support_avx()) - return &gotoblas_EXCAVATOR; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if(model == 0 || model == 8){ - if (exmodel == 1) { - //AMD Trinity - if(support_avx()) - return &gotoblas_PILEDRIVER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if (exmodel == 3) { - //AMD STEAMROLLER - if(support_avx()) - return &gotoblas_STEAMROLLER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
- } - }else if (exmodel == 6) { - if(support_avx()) - return &gotoblas_EXCAVATOR; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - - } - } - } else if (exfamily == 8) { - if (model == 1 || model == 8) { - if(support_avx()) - return &gotoblas_ZEN; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - } - } else if (exfamily == 9) { - if(support_avx()) - return &gotoblas_ZEN; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else { - return &gotoblas_BARCELONA; - } - } - } - - if (vendor == VENDOR_CENTAUR) { - switch (family) { - case 0x6: - return &gotoblas_NANO; - } - } - - return NULL; -} - -static char *corename[] = { - "Unknown", - "Katmai", - "Coppermine", - "Northwood", - "Prescott", - "Banias", - "Atom", - "Core2", - "Penryn", - "Dunnington", - "Nehalem", - "Athlon", - "Opteron", - "Opteron_SSE3", - "Barcelona", - "Nano", - "Sandybridge", - "Bobcat", - "Bulldozer", - "Piledriver", - "Haswell", - "Steamroller", - "Excavator", - "Zen", - "SkylakeX" -}; - -char *gotoblas_corename(void) { - - if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; - if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; - if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; - if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; - if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; - if (gotoblas == &gotoblas_ATOM) return corename[ 6]; - if (gotoblas == &gotoblas_CORE2) return corename[ 7]; - if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; - if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; - if (gotoblas == &gotoblas_NEHALEM) return corename[10]; - if (gotoblas == &gotoblas_ATHLON) return corename[11]; - if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; - if (gotoblas == &gotoblas_OPTERON) return corename[13]; - if (gotoblas == &gotoblas_BARCELONA) return corename[14]; - if (gotoblas == &gotoblas_NANO) return corename[15]; - if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; - if (gotoblas == &gotoblas_BOBCAT) return corename[17]; - if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; - if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; - if (gotoblas == &gotoblas_HASWELL) return corename[20]; - if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; - if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; - if (gotoblas == &gotoblas_ZEN) return corename[23]; - if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; - return corename[0]; -} - - -static gotoblas_t *force_coretype(char *coretype){ - - int i ; - int found = -1; - char message[128]; - //char mname[20]; - - for ( i=1 ; i <= 24; i++) - { - if (!strncasecmp(coretype,corename[i],20)) - { - found = i; - break; - } - } - if (found < 0) - { - //strncpy(mname,coretype,20); - snprintf(message, 128, "Core not found: %s\n",coretype); - openblas_warning(1, message); - return(NULL); - } - - switch (found) - { - case 24: return (&gotoblas_SKYLAKEX); - case 23: return (&gotoblas_ZEN); - case 22: return (&gotoblas_EXCAVATOR); - case 21: return (&gotoblas_STEAMROLLER); - case 20: return (&gotoblas_HASWELL); - case 19: return (&gotoblas_PILEDRIVER); - case 18: return (&gotoblas_BULLDOZER); - case 17: return (&gotoblas_BOBCAT); - case 16: return (&gotoblas_SANDYBRIDGE); - case 15: return (&gotoblas_NANO); - case 
14: return (&gotoblas_BARCELONA); - case 13: return (&gotoblas_OPTERON); - case 12: return (&gotoblas_OPTERON_SSE3); - case 11: return (&gotoblas_ATHLON); - case 10: return (&gotoblas_NEHALEM); - case 9: return (&gotoblas_DUNNINGTON); - case 8: return (&gotoblas_PENRYN); - case 7: return (&gotoblas_CORE2); - case 6: return (&gotoblas_ATOM); - case 5: return (&gotoblas_BANIAS); - case 4: return (&gotoblas_PRESCOTT); - case 3: return (&gotoblas_NORTHWOOD); - case 2: return (&gotoblas_COPPERMINE); - case 1: return (&gotoblas_KATMAI); - } - return(NULL); - -} - - - - -void gotoblas_dynamic_init(void) { - - char coremsg[128]; - char coren[22]; - char *p; - - - if (gotoblas) return; - - p = getenv("OPENBLAS_CORETYPE"); - if ( p ) - { - gotoblas = force_coretype(p); - } - else - { - gotoblas = get_coretype(); - } - -#ifdef ARCH_X86 - if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; -#else - if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; - /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ - if (sizeof(void*) == 8) { - if (gotoblas == &gotoblas_KATMAI || - gotoblas == &gotoblas_COPPERMINE || - gotoblas == &gotoblas_NORTHWOOD || - gotoblas == &gotoblas_BANIAS || - gotoblas == &gotoblas_ATHLON) - gotoblas = &gotoblas_PRESCOTT; - } -#endif - - if (gotoblas && gotoblas -> init) { - strncpy(coren,gotoblas_corename(),20); - sprintf(coremsg, "Core: %s\n",coren); - openblas_warning(2, coremsg); - gotoblas -> init(); - } else { - openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); - exit(1); - } - -} - -void gotoblas_dynamic_quit(void) { - - gotoblas = NULL; - -} From 3518617f5b7118286db9ab86a85cc078c00d6046 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Dec 2019 08:32:29 +0100 Subject: [PATCH 797/935] Add Intel Goldmont+ cpuid was originally in #2228 but that PR had misplaced the file in the toplevel directory --- driver/others/dynamic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index a4ff0e086..2e87e186a 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){ } return NULL; case 7: + if (model == 10) // Goldmont Plus + return &gotoblas_NEHALEM; if (model == 14) { // Ice Lake if (support_avx512()) From 3938e59569cd44634dce06c832f0db12968cd7fc Mon Sep 17 00:00:00 2001 From: Kavana Bhat Date: Wed, 4 Dec 2019 00:23:46 -0600 Subject: [PATCH 798/935] AIX changes for Power8 --- kernel/Makefile.L3 | 193 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 180 insertions(+), 13 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index ed8ae406f..4decfbd20 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1,4 +1,5 @@ USE_GEMM3M = 0 +OS := $(shell uname) ifeq ($(ARCH), x86) USE_GEMM3M = 1 @@ -434,10 +435,15 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) @@ -445,17 +451,26 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) 
+ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -466,10 +481,14 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -494,16 +513,10 @@ endif endif $(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) -# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s -# m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -# rm cgemm_oncopy.s cgemm_oncopy_nomacros.s $(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) -# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_otcopy.s -# m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -# rm cgemm_otcopy.s cgemm_otcopy_nomacros.s ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) @@ -511,10 +524,14 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -530,10 +547,14 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -558,67 +579,107 @@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o 
$@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ +endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ +endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ +endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +endif 
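# All of the AIX branches above follow the same three-step recipe: preprocess
# the kernel source to assembly with -E, expand the remaining macros with m4
# (apparently the AIX assembler cannot digest them directly), then assemble
# the cleaned-up file and delete the intermediates. In generic form (the file
# names and the KERNEL_DEFINES variable here are placeholders, not real
# targets from this Makefile):
#
#	$(CC) $(CFLAGS) -E $(KERNEL_DEFINES) $< -o kernel.s
#	m4 kernel.s > kernel_nomacros.s
#	$(CC) $(CFLAGS) -c $(KERNEL_DEFINES) kernel_nomacros.s -o $@
#	rm kernel.s kernel_nomacros.s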
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -635,56 +696,84 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s -# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s -# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s -# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm 
dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s -# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -699,100 +788,165 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif 
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif 
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif + else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -804,10 +958,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -931,16 +1089,17 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) -# $(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s -# m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ -# rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ +endif $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ @@ -2180,10 +2339,14 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ @@ -2222,10 +2385,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ From a4896b5538e5a3299acd6857b055e58fc3cce398 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Dec 2019 11:06:03 +0100 Subject: [PATCH 799/935] Update DYNAMIC_ARCH support for ARM64 and PPC (#2332) * Update DYNAMIC_ARCH list of ARM64 targets for gmake * Update arm64 cpu list for runtime detection * Update DYNAMIC_ARCH list of ARM64 targets for cmake and add POWERPC targets --- Makefile.arm64 | 3 ++ Makefile.system | 6 +++ cmake/arch.cmake | 6 ++- cmake/prebuild.cmake | 77 +++++++++++++++++++++++++++++++++++ driver/others/dynamic_arm64.c | 56 +++++++++++++++++++++---- 5 files changed, 138 insertions(+), 10 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 4d10ff684..c17ea7938 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif +endif + diff --git a/Makefile.system b/Makefile.system index 4cb4dc954..ab2ffca52 100644 --- a/Makefile.system +++ b/Makefile.system @@ -326,6 +326,7 @@ ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` 
\> 5) +GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) # GCC Major version > 4 @@ -547,9 +548,14 @@ endif ifeq ($(ARCH), arm64) DYNAMIC_CORE = ARMV8 +DYNAMIC_CORE += CORTEXA53 DYNAMIC_CORE += CORTEXA57 +DYNAMIC_CORE += CORTEXA72 +DYNAMIC_CORE += CORTEXA73 +DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 +DYNAMIC_CORE += TSV110 endif ifeq ($(ARCH), power) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index f3ae84fe0..8280d6274 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,11 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110) + endif () + + if (POWER) + set(DYNAMIC_CORE POWER6 POWER8 POWER9) endif () if (X86) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 086df1943..c6d109356 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -309,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "TSV110") + file(APPEND ${TARGET_CONF_TEMP} + "#define ARMV8\n" + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "POWER6") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 128\n" + "#define L2_SIZE 524288\n" + "#define L2_LINESIZE 128 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 8) + elseif ("${TCORE}" STREQUAL "POWER8") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 128\n" + "#define L2_SIZE 524288\n" + "#define L2_LINESIZE 128 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 16) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 8) + elseif ("${TCORE}" STREQUAL "POWER9") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 128\n" + "#define L2_SIZE 524288\n" + "#define L2_LINESIZE 128 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 16) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 8) endif() # Or should this actually be NUM_CORES? 
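The runtime detection added below keys off the two identification fields of
the ARM64 MIDR register: the implementer byte selects the vendor branch
(0x41 Arm, 0x48 HiSilicon, 0x51 Qualcomm, and so on) and the part number
selects the core. A sketch of the decoding, assuming a raw MIDR_EL1 value
(the variable names are illustrative; the driver itself obtains these fields
from the OS, e.g. from /proc/cpuinfo on Linux):

	/* MIDR_EL1: implementer[31:24] variant[23:20] arch[19:16] part[15:4] rev[3:0] */
	unsigned implementer = (midr >> 24) & 0xff;  /* e.g. 0x48 = HiSilicon */
	unsigned part        = (midr >> 4) & 0xfff;  /* e.g. 0xd01 = TSV110   */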
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 9db9ba17d..72f5fcca2 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -43,13 +43,18 @@ #endif extern gotoblas_t gotoblas_ARMV8; +extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; +extern gotoblas_t gotoblas_CORTEXA72; +extern gotoblas_t gotoblas_CORTEXA73; +extern gotoblas_t gotoblas_FALKOR; extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; +extern gotoblas_t gotoblas_TSV110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 4 +#define NUM_CORETYPES 9 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -65,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg); static char *corename[] = { "armv8", + "cortexa53", "cortexa57", + "cortexa72", + "cortexa73", + "falkor", "thunderx", "thunderx2t99", + "tsv110", "unknown" }; char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ARMV8) return corename[ 0]; - if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1]; - if (gotoblas == &gotoblas_THUNDERX) return corename[ 2]; - if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3]; + if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1]; + if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2]; + if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3]; + if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4]; + if (gotoblas == &gotoblas_FALKOR) return corename[ 5]; + if (gotoblas == &gotoblas_THUNDERX) return corename[ 6]; + if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7]; + if (gotoblas == &gotoblas_TSV110) return corename[ 8]; return corename[NUM_CORETYPES]; } @@ -96,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) { switch (found) { case 0: return (&gotoblas_ARMV8); - case 1: return (&gotoblas_CORTEXA57); - case 2: return (&gotoblas_THUNDERX); - case 3: return (&gotoblas_THUNDERX2T99); + case 1: return (&gotoblas_CORTEXA53); + case 2: return (&gotoblas_CORTEXA57); + case 3: return (&gotoblas_CORTEXA72); + case 4: return (&gotoblas_CORTEXA73); + case 5: return (&gotoblas_FALKOR); + case 6: return (&gotoblas_THUNDERX); + case 7: return (&gotoblas_THUNDERX2T99); + case 8: return (&gotoblas_TSV110); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -136,10 +156,14 @@ static gotoblas_t *get_coretype(void) { case 0x41: // ARM switch (part) { - case 0xd07: // Cortex A57 - case 0xd08: // Cortex A72 case 0xd03: // Cortex A53 + return &gotoblas_CORTEXA53; + case 0xd07: // Cortex A57 return &gotoblas_CORTEXA57; + case 0xd08: // Cortex A72 + return &gotoblas_CORTEXA72; + case 0xd09: // Cortex A73 + return &gotoblas_CORTEXA73; } break; case 0x42: // Broadcom @@ -158,6 +182,20 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX2T99; } break; + case 0x48: // HiSilicon + switch (part) + { + case 0xd01: // tsv110 + return &gotoblas_TSV110; + } + break; + case 0x51: // Qualcomm + switch (part) + { + case 0xc00: // Falkor + return &gotoblas_FALKOR; + } + break; } return NULL; } From 6baa9b07d7e88f93ef42db4e96fa3d2be035c3d4 Mon Sep 17 00:00:00 2001 From: Kavana Bhat Date: Fri, 6 Dec 2019 04:33:32 -0600 Subject: [PATCH 800/935] AIX changes for Power8 --- common_power.h | 8 ++++---- kernel/Makefile.L3 | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common_power.h b/common_power.h index 76b9f0f32..9df398266 100644 --- a/common_power.h +++ b/common_power.h @@ 
-60,10 +60,10 @@ #define XXSWAPD(T,A) xxswapd T, A #define XVMOVDP(T,A) xvmovdp T, A -#define XXSPLTD_S(T,A,z) "xxspltd T, A, z \n\t" -#define XXMRGHD_S(T,A,B) "xxmrghd T, A, B \n\t" -#define XXMRGLD_S(T,A,B) "xxmrgld T, A, B \n\t" -#define XXSWAPD_S(T,A) "xxswapd T, A" +#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t" +#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t" +#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t" +#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t" #endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4decfbd20..c36a44f20 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1098,7 +1098,7 @@ ifeq ($(OS), AIX) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s else - $(CC) $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ endif $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) From b28db31429d9b3b6a57a182d79e63aafdd2843f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 6 Dec 2019 21:23:56 +0100 Subject: [PATCH 801/935] Support two-digit version numbers in gcc version check Fixes #2336 (non-recognition of gcc 10), with the patch provided by JeffreyALaw. --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index 993ad9a35..79b24e2dc 100644 --- a/f_check +++ b/f_check @@ -71,7 +71,7 @@ if ($compiler eq "") { if ($data =~ /GNU/) { - $data =~ /(\d)\.(\d).(\d)/; + $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; $minor = $2; From 13226e310195c7dd5e051c791c4f0839f2f606c4 Mon Sep 17 00:00:00 2001 From: Jehan Date: Wed, 11 Dec 2019 17:51:42 +0100 Subject: [PATCH 802/935] driver: more reasonable thread wait timeout on Windows. It used to be 5 ms, which might not be long enough in some cases for the thread to exit cleanly; but when set to 5000 ms (5 s), it would slow down any program depending on OpenBLAS. Set it to 50 ms instead: ten times the original value, yet still short enough to keep shutdown reasonable if a thread fails to terminate.
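The pattern being tuned here is a bounded wait followed by a forced termination, so the timeout only bounds the worst case in which a worker fails to exit on its own. A simplified sketch of that logic — shutdown_worker and SHUTDOWN_WAIT_MS are illustrative names, not the actual blas_server_win32.c code:

    #include <windows.h>

    #define SHUTDOWN_WAIT_MS 50 /* the compromise this patch chooses */

    static void shutdown_worker(HANDLE thread)
    {
        /* Give the worker a bounded time to exit on its own. */
        if (WaitForSingleObject(thread, SHUTDOWN_WAIT_MS) != WAIT_OBJECT_0) {
            /* It did not exit in time: force it. As the diff below notes,
               TerminateThread is unavailable in UWP (WINAPI_APP) builds. */
            TerminateThread(thread, 0);
        }
        CloseHandle(thread);
    }

With this structure, a well-behaved thread costs at most 50 ms at shutdown, while the previous 5000 ms value made every dependent program pay the full timeout whenever a thread was slow to signal.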
--- driver/others/blas_server_win32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index e27725baf..5ecc4428b 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -462,7 +462,7 @@ int BLASFUNC(blas_thread_shutdown)(void){ for(i = 0; i < blas_num_threads - 1; i++){ // Could also just use WaitForMultipleObjects - DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000); + DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP From aeef942c4f2a17099d82307d482abdec53bd3fbd Mon Sep 17 00:00:00 2001 From: w00421467 Date: Tue, 17 Dec 2019 10:00:13 +0800 Subject: [PATCH 803/935] use arm neon instructions to optimize gemm beta operation --- kernel/arm64/KERNEL | 2 +- kernel/arm64/dgemm_beta.S | 178 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 kernel/arm64/dgemm_beta.S diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index f936cdf47..440257196 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -34,7 +34,7 @@ ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA -DGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../arm64/dgemm_beta.S endif ifndef CGEMM_BETA CGEMM_BETA = ../generic/zgemm_beta.c diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S new file mode 100644 index 000000000..636954695 --- /dev/null +++ b/kernel/arm64/dgemm_beta.S @@ -0,0 +1,178 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define BETA d0 +#define LDC x6 +#define C00 x7 + +#define A01 x8 +#define A02 x9 +#define A03 x10 +#define A04 x11 + +#define beta0 d11 +#define betaV0 v11.d[0] +#define I x16 + +#define size 128 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + ldr LDC, [sp] + SAVE_REGS + +.Lgemm_beta_BEGIN: + + fmov beta0, BETA + cmp N, #0 + ble .Lgemm_beta_L999 + +.Lgemm_beta_01: + + lsl LDC, LDC, #3 + + .align 5 +.Lgemm_beta_02: + + mov A01, C00 + add C00, C00, LDC + asr I, M, #4 + cmp I, #0 + ble .Lgemm_beta_04 + add A02, A01, #32 + add A03, A02, #32 + add A04, A03, #32 + + .align 5 +.Lgemm_beta_03: + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + ldp q4, q5, [A03] + ldp q6, q7, [A04] + + fmul v0.2d, v0.2d, betaV0 + fmul v1.2d, v1.2d, betaV0 + + fmul v2.2d, v2.2d, betaV0 + fmul v3.2d, v3.2d, betaV0 + + fmul v4.2d, v4.2d, betaV0 + fmul v5.2d, v5.2d, betaV0 + + fmul v6.2d, v6.2d, betaV0 + fmul v7.2d, v7.2d, betaV0 + + st1 {v0.2d, v1.2d}, [A01] + add A01, A01, size + st1 {v2.2d, v3.2d}, [A02] + add A02, A02, size + st1 {v4.2d, v5.2d}, [A03] + add A03, A03, size + st1 {v6.2d, v7.2d}, [A04] + add A04, A04, size + + subs I , I , #1 + bne .Lgemm_beta_03 + + .align 5 +.Lgemm_beta_04: + + and I, M , #15 // M%16 + cmp I, #0 + ble .Lgemm_beta_06 + + .align 5 +.Lgemm_beta_05: + + ldr d12, [A01] + fmul d12, d12, beta0 + str d12, [A01] + add A01, A01, #8 + + subs I , I , #1 + bne .Lgemm_beta_05 + + .align 5 +.Lgemm_beta_06: + + subs N , N, #1 // N-- + bne .Lgemm_beta_02 + + .align 5 +.Lgemm_beta_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE From b7cc69ee622fed9039ab755b87eee9279d27d541 Mon Sep 17 00:00:00 2001 From: w00421467 Date: Fri, 20 Dec 2019 10:11:50 +0800 Subject: [PATCH 804/935] declare DGEMM_BETA in KERNEL.ARMV8 rather than the generic KERNEL --- kernel/arm64/KERNEL | 2 +- kernel/arm64/KERNEL.ARMV8 | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index 440257196..f936cdf47 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -34,7 +34,7 @@ ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA -DGEMM_BETA = ../arm64/dgemm_beta.S +DGEMM_BETA = ../generic/gemm_beta.c endif ifndef CGEMM_BETA 
CGEMM_BETA = ../generic/zgemm_beta.c diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index efc1ec8bc..b90dd228b 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -102,6 +102,8 @@ CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S +DGEMM_BETA = dgemm_beta.S + SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) From d573d24de7cda411edbf0675c7c2e2dd8cdb896f Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 21 Dec 2019 14:35:15 +0800 Subject: [PATCH 805/935] Fast Haswell ZGEMM kernel --- kernel/x86_64/zgemm_kernel_4x2_haswell.c | 240 +++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 kernel/x86_64/zgemm_kernel_4x2_haswell.c diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.c b/kernel/x86_64/zgemm_kernel_4x2_haswell.c new file mode 100644 index 000000000..3279b8b8c --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.c @@ -0,0 +1,240 @@ +#include "common.h" +#include + +/* recommended settings: GEMM_P = 192, GEMM_Q = 192 */ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define A_CONJ 0 + #define B_CONJ 0 +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define A_CONJ 1 + #define B_CONJ 0 +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define A_CONJ 0 + #define B_CONJ 1 +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define A_CONJ 1 + #define B_CONJ 1 +#endif + +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */ +/* r11 = m, r12 = k << 5, r13 = k, r14 = b_head, r15 = temp */ + +/* m=4, ymm 0-3 temp, ymm 4-15 acc */ +#if A_CONJ == B_CONJ + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m4n1_con(ua,la,b1,uc,lc) "vfmaddsub231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#else + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m4n1_con(ua,la,b1,uc,lc) "vfmsubadd231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#endif +/* expanded accumulators for m4n1 and m4n2 */ +#define KERNEL_k1m4n1 \ + "vbroadcastf128 (%1),%%ymm0; addq $16,%1;"\ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;" acc_m2n1_exp(1,2,0,4,5)\ + "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2;" acc_m2n1_exp(1,2,0,6,7)\ + "addq $64,%0;" +#define KERNEL_k1m4n2 \ + "vbroadcastf128 (%1),%%ymm0; vbroadcastf128 16(%1),%%ymm1; addq $32,%1;"\ + "vmovddup (%0),%%ymm2; vmovddup 8(%0),%%ymm3;" acc_m2n1_exp(2,3,0,4,5) acc_m2n1_exp(2,3,1,8,9)\ + "vmovddup 32(%0),%%ymm2; vmovddup 40(%0),%%ymm3;" acc_m2n1_exp(2,3,0,6,7) acc_m2n1_exp(2,3,1,10,11)\ + "addq $64,%0;" +/* contracted accumulators for m4n4 and m4n6 */ +#define acc_m4n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) 
\ + "vbroadcastsd "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m4n1_con(ua,la,2,luc,llc)\ + "vbroadcastsd "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m4n1_con(ua,la,3,ruc,rlc) +#define KERNEL_1_k1m4n4 \ + "vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ + acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1) +#define KERNEL_2_k1m4n4 \ + "vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\ + acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1) +#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2) +#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2) +#define KERNEL_k1m4n4 KERNEL_1_k1m4n4 KERNEL_2_k1m4n4 "addq $32,%1;" +#define KERNEL_k1m4n6 KERNEL_1_k1m4n6 KERNEL_2_k1m4n6 "addq $32,%1;" +#define zero_4ymm(no1,no2,no3,no4) \ + "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\ + "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";" +/* initialization and storage macros */ +#define INIT_m4n1 zero_4ymm(4,5,6,7) +#define INIT_m4n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m4n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15) +#if A_CONJ == B_CONJ + #define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";" +#else + #define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1ymm(c,tmp,off,alpr,alpi,...) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213pd "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\ + "vfmsubadd231pd %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovupd %%ymm"#c","#off"("#__VA_ARGS__");" +#else + #define save_1ymm(c,tmp,off,alpr,alpi,...) 
\ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213pd "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\ + "vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovupd %%ymm"#tmp","#off"("#__VA_ARGS__");" +#endif +#define save_init_m4 "movq %2,%3; addq $64,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3) +#define SAVE_m4n2 SAVE_m4n1\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1) +#define SAVE_m4n4 save_init_m4\ + save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\ + save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1) +#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\ + save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1) +#define COMPUTE_m4(ndim) \ + "movq %%r14,%1;" INIT_m4n##ndim "movq %2,%3; movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"4443f; cmpq $10,%5; jb "#ndim"4442f;"\ + "movq $10,%5; movq $84,%%r15;"\ + #ndim"4441:\n\t"\ + "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ + "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ + "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\ + "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\ + #ndim"4442:\n\t"\ + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ + KERNEL_k1m4n##ndim "decq %5; jnz "#ndim"4442b;"\ + #ndim"4443:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m4n##ndim + +/* m=2, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ +#define KERNEL_k1m2n1 \ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\ + "vbroadcastf128 (%1),%%ymm0;" acc_m2n1_exp(1,2,0,4,5) "addq $16,%1;" +#define acc_m2n2_exp(c1l,c1r,c2l,c2r,...) 
\ + "vbroadcastf128 ("#__VA_ARGS__"),%%ymm2;" acc_m2n1_exp(0,1,2,c1l,c1r)\ + "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3;" acc_m2n1_exp(0,1,3,c2l,c2r) +#define KERNEL_h_k1m2n2 \ + "vmovddup (%0),%%ymm0; vmovddup 8(%0),%%ymm1; addq $32,%0;" acc_m2n2_exp(4,5,6,7,%1) +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;" +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;" +#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $32,%1;" +#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m2n2 zero_4ymm(4,5,6,7) +#define INIT_m2n4 INIT_m2n2 zero_4ymm(8,9,10,11) +#define INIT_m2n6 INIT_m2n4 zero_4ymm(12,13,14,15) +#define save_init_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m2n1 save_init_m2 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3) +#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1) +#define SAVE_m2n4 SAVE_m2n2 "leaq (%3,%4,2),%3;"\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1) +#define SAVE_m2n6 SAVE_m2n4 "leaq (%3,%4,2),%3;"\ + cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1) +#define COMPUTE_m2(ndim) \ + "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"2222f;"\ + #ndim"2221:\n\t"\ + KERNEL_k1m2n##ndim\ + "decq %5; jnz "#ndim"2221b;"\ + #ndim"2222:\n\t"\ + SAVE_m2n##ndim + +/* m=1, vmm 0-3 temp, vmm 4-15 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";" +#else + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfnmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";" +#endif +#define KERNEL_k1m1n1 \ + "vmovddup (%0),%%xmm0; vmovddup 8(%0),%%xmm1; addq $16,%0;"\ + "vmovupd (%1),%%xmm2; addq $16,%1;" acc_m1n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n2 \ + "vbroadcastsd (%0),%%ymm0; vbroadcastsd 8(%0),%%ymm1; addq $16,%0;"\ + "vmovupd (%1),%%ymm2;" acc_m1n2_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovupd (%1,%%r12,1),%%ymm2;" acc_m1n2_exp(0,1,2,6,7) +#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovupd (%1,%%r12,2),%%ymm2;" acc_m1n2_exp(0,1,2,8,9) +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;" +#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $32,%1;" +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4; vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n2 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm6,%%ymm6,%%ymm6; vpxor %%ymm7,%%ymm7,%%ymm7;" +#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9;" +#if A_CONJ == B_CONJ + #define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";" +#else + #define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";" +#endif +#if A_CONJ == 0 + #define save_m1n1(c,tmp,alpr,alpi) \ + "vpermilpd 
$5,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213pd (%3),%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231pd %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovupd %%xmm"#c",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\ + "vfmsubadd213pd %%ymm"#tmp2",%%ymm"#alpr",%%ymm"#c"; vfmsubadd231pd %%ymm"#tmp1",%%ymm"#alpi",%%ymm"#c";"\ + "vmovupd %%xmm"#c",(%3); vextractf128 $1,%%ymm"#c",(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define save_m1n1(c,tmp,alpr,alpi) \ + "vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213pd (%3),%%xmm"#alpi",%%xmm"#tmp";"\ + "vfmaddsub231pd %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovupd %%xmm"#tmp",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\ + "vfmaddsub213pd %%ymm"#tmp2",%%ymm"#alpi",%%ymm"#tmp1"; vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp1";"\ + "vmovupd %%xmm"#tmp1",(%3); vextractf128 $1,%%ymm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define save_init_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,0,1) +#define SAVE_m1n2 save_init_m1 cont_expacc(4,5,4) save_m1n2(4,2,3,0,1) +#define SAVE_m1n4 SAVE_m1n2 cont_expacc(6,7,6) save_m1n2(6,2,3,0,1) +#define SAVE_m1n6 SAVE_m1n4 cont_expacc(8,9,8) save_m1n2(8,2,3,0,1) +#define COMPUTE_m1(ndim) \ + "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"1112f;"\ + #ndim"1111:\n\t"\ + KERNEL_k1m1n##ndim\ + "decq %5; jnz "#ndim"1111b;"\ + #ndim"1112:\n\t"\ + SAVE_m1n##ndim + +#define COMPUTE(ndim) {\ + b_pref = b_ptr + ndim * K *2;\ + __asm__ __volatile__ (\ + "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $5,%%r12; movq %7,%%r11;"\ + "cmpq $4,%7; jb "#ndim"9992f;"\ + #ndim"9991:\n\t"\ + COMPUTE_m4(ndim)\ + "subq $4,%7; cmpq $4,%7; jnb "#ndim"9991b;"\ + #ndim"9992:\n\t"\ + "cmpq $2,%7; jb "#ndim"9993f;"\ + COMPUTE_m2(ndim) "subq $2,%7;"\ + #ndim"9993:\n\t"\ + "testq %7,%7; jz "#ndim"9994f;"\ + COMPUTE_m1(ndim)\ + #ndim"9994:\n\t"\ + "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\ + ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\ + "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2; +#if A_CONJ == B_CONJ + double const_val[2] = {-alphar, -alphai}; +#else + double const_val[2] = {alphar, alphai}; +#endif + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B; + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From f41d52665d589440dd5227b52025ea492bea4c6e Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 21 Dec 2019 14:37:06 +0800 Subject: [PATCH 806/935] 
Fast Haswell ZGEMM kernel --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index f98728a41..5c11ced1d 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -67,7 +67,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S -ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S +ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c From 105e26e12ac2283ec2bee50d03d02d77a2c92780 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 21 Dec 2019 14:38:51 +0800 Subject: [PATCH 807/935] Adjust Haswell ZGEMM blocking parameters --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index d39fc4a1d..5fb0868b2 100644 --- a/param.h +++ b/param.h @@ -1572,7 +1572,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 384 -#define ZGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 192 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 320 @@ -1582,7 +1582,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_Q 256 #endif #define CGEMM_DEFAULT_Q 192 -#define ZGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R 13824 From 025741f16aeaafe0080b9065dbf2315762b286e4 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:40:03 +0800 Subject: [PATCH 808/935] Fast Haswell CGEMM kernel --- kernel/x86_64/cgemm_kernel_8x2_haswell.c | 287 +++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 kernel/x86_64/cgemm_kernel_8x2_haswell.c diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c new file mode 100644 index 000000000..49fef90db --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -0,0 +1,287 @@ +#include "common.h" +#include + +/* recommended settings: GEMM_P = 256, GEMM_Q = 256 */ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define A_CONJ 0 + #define B_CONJ 0 +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define A_CONJ 1 + #define B_CONJ 0 +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define A_CONJ 0 + #define B_CONJ 1 +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define A_CONJ 1 + #define B_CONJ 1 +#endif + +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */ +/* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */ + +/* m=8, ymm 0-3 temp, ymm 4-15 acc */ +#if A_CONJ == B_CONJ + #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#else + #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" 
+#endif +/* expanded accumulators for m8n1 and m8n2 */ +#define KERNEL_k1m8n1 \ + "vbroadcastsd (%1),%%ymm0; addq $8,%1;"\ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\ + "addq $64,%0;" +#define KERNEL_k1m8n2 \ + "vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\ + "vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\ + "vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\ + "addq $64,%0;" +/* contracted accumulators for m8n4 and m8n6 */ +#define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \ + "vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\ + "vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc) +#define KERNEL_1_k1m8n4 \ + "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ + acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1) +#define KERNEL_2_k1m8n4 \ + "vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\ + acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1) +#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2) +#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2) +#define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;" +#define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;" +#define zero_4ymm(no1,no2,no3,no4) \ + "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\ + "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";" +/* initialization and storage macros */ +#define INIT_m8n1 zero_4ymm(4,5,6,7) +#define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15) +#if A_CONJ == B_CONJ + #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";" +#else + #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1ymm(c,tmp,off,alpr,alpi,...) \ + "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\ + "vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");" +#else + #define save_1ymm(c,tmp,off,alpr,alpi,...) 
\ + "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\ + "vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");" +#endif +#define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" +#define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3) +#define SAVE_m8n2 SAVE_m8n1\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1) +#define SAVE_m8n4 save_init_m8\ + save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\ + save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1) +#define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\ + save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1) +#define COMPUTE_m8(ndim) \ + "movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\ + "movq $10,%5; movq $84,%%r15;"\ + #ndim"8881:\n\t"\ + "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ + KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\ + "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\ + #ndim"8882:\n\t"\ + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ + KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\ + #ndim"8883:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim +/* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ +#define KERNEL_k1m4n1 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;" +#define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) 
\ + "vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\ + "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r) +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1) +#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m4n2 zero_4ymm(4,5,6,7) +#define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11) +#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15) +#define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" +#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3) +#define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1) +#define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1) +#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\ + cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1) +#define COMPUTE_m4(ndim) \ + "movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"4442f;"\ + #ndim"4441:\n\t"\ + KERNEL_k1m4n##ndim\ + "decq %5; jnz "#ndim"4441b;"\ + #ndim"4442:\n\t"\ + SAVE_m4n##ndim +/* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" +#else + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" +#endif +#define KERNEL_h_k1m2n1 \ + "vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\ + "vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\ + "vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7) +#define acc_m2n2_exp(c1,c2,c3,c4,...)\ + "vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4) +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;" +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" +#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;" +#define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";" +#define INIT_m2n1 zero_2xmm(4,5) +#define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7) +#define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11) +#define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15) +#if A_CONJ == B_CONJ + #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";" +#else + #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1xmm(c,tmp,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;" +#else + 
#define save_1xmm(c,tmp,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\ + "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;" +#endif +#define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" +#define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1) +#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1xmm(6,3,0,1) +#define SAVE_m2n4 SAVE_m2n2 cont_expacc(8,9,8) save_1xmm(8,2,0,1) cont_expacc(10,11,10) save_1xmm(10,3,0,1) +#define SAVE_m2n6 SAVE_m2n4 cont_expacc(12,13,12) save_1xmm(12,2,0,1) cont_expacc(14,15,14) save_1xmm(14,3,0,1) +#define COMPUTE_m2(ndim) \ + "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"2222f;"\ + #ndim"2221:\n\t"\ + KERNEL_k1m2n##ndim\ + "decq %5; jnz "#ndim"2221b;"\ + #ndim"2222:\n\t"\ + SAVE_m2n##ndim +/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";" +#else + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";" +#endif +#define KERNEL_k1m1n1 \ + "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\ + "vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n2 \ + "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\ + "vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7) +#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9) +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" +#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;" +#define INIT_m1n1 zero_2xmm(4,5) +#define INIT_m1n2 zero_2xmm(4,5) +#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7) +#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9) +#if A_CONJ == 0 + #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\ + "vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\ + "vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\ + "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\ + "vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\ + "vmovsd %%xmm"#tmp1",(%3); vmovhpd 
%%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" +#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1) +#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1) +#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1) +#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1) +#define COMPUTE_m1(ndim) \ + "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"1112f;"\ + #ndim"1111:\n\t"\ + KERNEL_k1m1n##ndim\ + "decq %5; jnz "#ndim"1111b;"\ + #ndim"1112:\n\t"\ + SAVE_m1n##ndim +#define COMPUTE(ndim) {\ + b_pref = b_ptr + ndim * K *2;\ + __asm__ __volatile__ (\ + "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\ + "cmpq $8,%7; jb "#ndim"9992f;"\ + #ndim"9991:\n\t"\ + COMPUTE_m8(ndim)\ + "subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\ + #ndim"9992:\n\t"\ + "cmpq $4,%7; jb "#ndim"9993f;"\ + COMPUTE_m4(ndim) "subq $4,%7;"\ + #ndim"9993:\n\t"\ + "cmpq $2,%7; jb "#ndim"9994f;"\ + COMPUTE_m2(ndim) "subq $2,%7;"\ + #ndim"9994:\n\t"\ + "testq %7,%7; jz "#ndim"9995f;"\ + COMPUTE_m1(ndim)\ + #ndim"9995:\n\t"\ + "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\ + ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\ + "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ +} +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; +#if A_CONJ == B_CONJ + float const_val[2] = {-alphar, -alphai}; +#else + float const_val[2] = {alphar, alphai}; +#endif + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B; + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From c418c81224b56e2a99b5f3e7a159b30bfd8f8d8b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:41:44 +0800 Subject: [PATCH 809/935] Update KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 5c11ced1d..9bd34f1e3 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -56,7 +56,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = cgemm_kernel_8x2_haswell.S -CGEMMKERNEL = cgemm_kernel_8x2_haswell.S +CGEMMKERNEL = cgemm_kernel_8x2_haswell.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c From 2cd9306bb5138f8ec796964fa578b2ea1b73e921 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:42:30 +0800 Subject: [PATCH 810/935] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index be4503d47..aa4ba4834 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -53,7 +53,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = cgemm_kernel_8x2_haswell.S -CGEMMKERNEL = cgemm_kernel_8x2_haswell.S +CGEMMKERNEL = cgemm_kernel_8x2_haswell.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -64,7 +64,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S -ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S +ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c From 611445c7f8d136ce66bb8a825b3383fc8eb028bd Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:44:55 +0800 Subject: [PATCH 811/935] Update param.h --- param.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/param.h b/param.h index 5fb0868b2..d80bbf4f2 100644 --- a/param.h +++ b/param.h @@ -668,8 +668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 -#define CGEMM_DEFAULT_P 384 -#define ZGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 192 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 320 @@ -678,8 +678,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif -#define CGEMM_DEFAULT_Q 192 -#define ZGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R 13824 @@ -1571,7 +1571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 -#define CGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 192 #ifdef WINDOWS_ABI @@ -1581,7 +1581,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif -#define CGEMM_DEFAULT_Q 192 +#define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_R sgemm_r From 6fbe51072bed086b71d18ed77ee7b8cc79e63dd6 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:24:40 +0800 Subject: [PATCH 812/935] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 3859a9c19..99f82df9d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -171,3 +171,9 @@ In chronological order: * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes * [2019-03-14] power9 dgemm/dtrmm kernel * [2019-04-29] power9 sgemm/strmm kernel + +* Jiachen Wang + * [2018.07] optimize AVX2 DGEMM + * [2018-11] optimize AVX512 SGEMM and DGEMM + * [2018-11] AVX512 CGEMM & ZGEMM kernels + * [2018-12] optimize AVX2 CGEMM and ZGEMM From 3ce6bcdb5f61fad716703b9facf26087aade7ae2 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:30:16 +0800 Subject: [PATCH 813/935] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 99f82df9d..6d30ee942 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -173,7 +173,8 @@ In chronological order: * [2019-04-29] power9 sgemm/strmm kernel * Jiachen Wang - * [2018.07] optimize AVX2 DGEMM - * [2018-11] optimize AVX512 SGEMM and DGEMM - * [2018-11] AVX512 CGEMM & ZGEMM kernels - * [2018-12] optimize AVX2 CGEMM and ZGEMM + * [2019-07-29] optimize AVX2 DGEMM + * [2019-10-20] AVX512 DGEMM kernel (4x8) + * [2019-11-06] optimize AVX512 SGEMM + * [2019-11-12] AVX512 CGEMM & ZGEMM kernels + * [2019-12-23] optimize AVX2 CGEMM and ZGEMM From eeecd623d85e90c75172b610e4ecb11f4c04650e Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:40:16 +0800 Subject: [PATCH 814/935] Update cgemm_kernel_8x2_haswell.c --- kernel/x86_64/cgemm_kernel_8x2_haswell.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c index 49fef90db..eab8c9ea5 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.c +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -104,6 +104,7 @@ KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\ #ndim"8883:\n\t"\ "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim + /* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ #define KERNEL_k1m4n1 \ "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ @@ -137,6 +138,7 @@ "decq %5; jnz "#ndim"4441b;"\ #ndim"4442:\n\t"\ SAVE_m4n##ndim + /* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */ #if A_CONJ == B_CONJ #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" @@ -189,6 +191,7 @@ "decq %5; jnz "#ndim"2221b;"\ #ndim"2222:\n\t"\ SAVE_m2n##ndim + /* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */ #if A_CONJ == B_CONJ #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" @@ -242,6 +245,7 @@ "decq %5; jnz "#ndim"1111b;"\ #ndim"1112:\n\t"\ SAVE_m1n##ndim + #define COMPUTE(ndim) {\ b_pref = b_ptr + ndim * K *2;\ __asm__ __volatile__ (\ @@ -266,6 +270,7 @@ 
"xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ } + int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) { From 5fd1edead95b86df0e92fd2be1e0435d746af56d Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:00:55 +0800 Subject: [PATCH 815/935] Create cgemm3m_kernel_8x4_haswell.c --- kernel/x86_64/cgemm3m_kernel_8x4_haswell.c | 279 +++++++++++++++++++++ 1 file changed, 279 insertions(+) create mode 100644 kernel/x86_64/cgemm3m_kernel_8x4_haswell.c diff --git a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c new file mode 100644 index 000000000..831f25483 --- /dev/null +++ b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c @@ -0,0 +1,279 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ +/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */ + +#include "common.h" +#include + +//recommended settings: GEMM_P = 320, GEMM_Q = 320. + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m8n1 \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m8n2 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" +#define KERNEL_h_k1m8n4 \ + KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) 
\ + "vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_m8n1 \ + "vunpcklps %%ymm4,%%ymm4,%%ymm2; vunpckhps %%ymm4,%%ymm4,%%ymm3;"\ + "vperm2f128 $2,%%ymm2,%%ymm3,%%ymm1; vperm2f128 $19,%%ymm2,%%ymm3,%%ymm2;"\ + "vfmadd213ps (%2),%%ymm0,%%ymm1; vfmadd213ps 32(%2),%%ymm0,%%ymm2; vmovups %%ymm1,(%2); vmovups %%ymm2,32(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklpd "#c2","#c1",%%ymm2; vunpckhpd "#c2","#c1",%%ymm3;"\ + "vperm2f128 $2,%%ymm2,%%ymm3,"#c1"; vperm2f128 $19,%%ymm2,%%ymm3,"#c2";"\ + "vmovsldup "#c1",%%ymm2; vmovsldup "#c2",%%ymm3;"\ + "vfmadd213ps (%5),%%ymm0,%%ymm2; vfmadd213ps 32(%5),%%ymm0,%%ymm3; vmovups %%ymm2,(%5); vmovups %%ymm3,32(%5);"\ + "vmovshdup "#c1",%%ymm2; vmovshdup "#c2",%%ymm3;"\ + "vfmadd213ps (%5,%3,1),%%ymm0,%%ymm2; vfmadd213ps 32(%5,%3,1),%%ymm0,%%ymm3; vmovups %%ymm2,(%5,%3,1); vmovups %%ymm3,32(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_m8n8 SAVE_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define COMPUTE_m8(ndim) \ + INIT_m8n##ndim\ + "movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\ + "cmpq $24,%4; jb "#ndim"882f;"\ + #ndim"881:\n\t"\ + "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht1 (%8); addq $16,%8;"\ + "subq $8,%4; cmpq $24,%4; jnb "#ndim"881b;"\ + "movq %2,%5;"\ + #ndim"882:\n\t"\ + "testq %4,%4; jz "#ndim"883f;"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ + KERNEL_k1m8n##ndim\ + "decq %4; jmp "#ndim"882b;"\ + #ndim"883:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ + SAVE_m8n##ndim "addq $64,%2;" + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup (%1),%%xmm3; 
vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ + "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,1) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,2) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_m4n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm2; vunpckhps %%xmm4,%%xmm4,%%xmm3;"\ + "vfmadd213ps (%2),%%xmm0,%%xmm2; vfmadd213ps 16(%2),%%xmm0,%%xmm3; vmovups %%xmm2,(%2); vmovups %%xmm3,16(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklpd "#c2","#c1",%%xmm2; vunpckhpd "#c2","#c1","#c2"; vmovapd %%xmm2,"#c1";"\ + "vmovsldup "#c1",%%xmm2; vmovsldup "#c2",%%xmm3;"\ + "vfmadd213ps (%5),%%xmm0,%%xmm2; vfmadd213ps 16(%5),%%xmm0,%%xmm3; vmovups %%xmm2,(%5); vmovups %%xmm3,16(%5);"\ + "vmovshdup "#c1",%%xmm2; vmovshdup "#c2",%%xmm3;"\ + "vfmadd213ps (%5,%3,1),%%xmm0,%%xmm2; vfmadd213ps 16(%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm2,(%5,%3,1); vmovups %%xmm3,16(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) +#define COMPUTE_m4(ndim) \ + INIT_m4n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"442:\n\t"\ + "testq %4,%4; jz "#ndim"443f;"\ + KERNEL_k1m4n##ndim\ + "decq %4; jmp "#ndim"442b;"\ + #ndim"443:\n\t"\ + SAVE_m4n##ndim "addq $32,%2;" + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define SAVE_m2n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm1; vfmadd213ps (%2),%%xmm0,%%xmm1; vmovups %%xmm1,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_m2n2 SAVE_m2n1 \ + "vunpcklps %%xmm5,%%xmm5,%%xmm1; vfmadd213ps (%2,%3,1),%%xmm0,%%xmm1; vmovups %%xmm1,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define 
INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#define unit_save_m2n4(c1,c2) \ + "vunpcklpd "#c2","#c1",%%xmm1; vunpckhpd "#c2","#c1",%%xmm2;"\ + "vmovsldup %%xmm1,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\ + "vmovshdup %%xmm1,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;"\ + "vmovsldup %%xmm2,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\ + "vmovshdup %%xmm2,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) +#define COMPUTE_m2(ndim) \ + INIT_m2n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"222:\n\t"\ + "testq %4,%4; jz "#ndim"223f;"\ + KERNEL_k1m2n##ndim\ + "decq %4; jmp "#ndim"222b;"\ + #ndim"223:\n\t"\ + SAVE_m2n##ndim "addq $16,%2;" + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n2 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm3; vmovhpd (%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovsd %%xmm4,(%2); vmovhpd %%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#define unit_save_m1n4(c1) \ + "vunpcklps "#c1","#c1",%%xmm1; 
vunpckhps "#c1","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) +#define COMPUTE_m1(ndim) \ + INIT_m1n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"112:\n\t"\ + "testq %4,%4; jz "#ndim"113f;"\ + KERNEL_k1m1n##ndim\ + "decq %4; jmp "#ndim"112b;"\ + #ndim"113:\n\t"\ + SAVE_m1n##ndim "addq $8,%2;" + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ +/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */ +/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const),r15 = tmp */ + +#define COMPUTE(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%6),%%ymm0;"\ + "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $8,%7;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8(ndim)\ + "subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%7;jb 33103"#ndim"f;"\ + COMPUTE_m4(ndim)\ + "subq $4,%7;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%7;jb 33104"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%7;"\ + "33104"#ndim":\n\t"\ + "testq %7,%7;jz 33105"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\ + ::"r11","r12","r13","r14","r15"\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; + float constval[2]; constval[0] = alphar; constval[1] = alphai; + float *const_val=constval; + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From ed9af2f7dae61a23a18aab11025e1b4e586f5a51 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:01:38 +0800 Subject: [PATCH 816/935] Update KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9bd34f1e3..bdebd22b9 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -97,6 +97,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = 
zgemm3m_kernel_2x8_nehalem.S From 4c35b8dbaacfe23f76c48517674da8cf01cd2828 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:03:01 +0800 Subject: [PATCH 817/935] Update gemm3m_level3.c --- driver/level3/gemm3m_level3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c index bbde7e5d1..d037e72cd 100644 --- a/driver/level3/gemm3m_level3.c +++ b/driver/level3/gemm3m_level3.c @@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); @@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); @@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); From 3a66c8cac18dbbc172fc703feab22f53755a52c9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:04:08 +0800 Subject: [PATCH 818/935] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index aa4ba4834..025db515e 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -94,6 +94,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S From 64639f440f7e9cf630100e4b03999e9321018876 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:06:42 +0800 Subject: [PATCH 819/935] Update param.h --- param.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/param.h b/param.h index d80bbf4f2..4084c781d 100644 --- a/param.h +++ b/param.h @@ -693,15 +693,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 8 -#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 -#define CGEMM3M_DEFAULT_P 448 +#define CGEMM3M_DEFAULT_P 320 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_Q 320 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 @@ -1596,15 +1596,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
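+/* Sizing rationale (inferred, not stated in the commit): the new 8x4 */
+/* CGEMM3M kernel swaps the M/N unroll factors, and Q grows to 320 so */
+/* one packed strip of A, 8 * 320 * sizeof(float) = 10 KB, stays well */
+/* inside a 32 KB L1d cache. Treat these numbers as a rough sketch.   */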
#define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 8 -#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 -#define CGEMM3M_DEFAULT_P 448 +#define CGEMM3M_DEFAULT_P 320 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_Q 320 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 From cd765f094b52bc010091f4782e232706a854ea90 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:23:29 +0800 Subject: [PATCH 820/935] Update cgemm3m_kernel_8x4_haswell.c --- kernel/x86_64/cgemm3m_kernel_8x4_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c index 831f25483..01fbf3064 100644 --- a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c +++ b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c @@ -255,7 +255,7 @@ "33105"#ndim":\n\t"\ "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\ - ::"r11","r12","r13","r14","r15"\ + ::"r11","r12","r13","r14","r15",\ "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\ } From 312060d0d6b720eeb9bafbddaeedf3ba968a3732 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 23:36:13 +0800 Subject: [PATCH 821/935] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6d30ee942..fd759913d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -178,3 +178,4 @@ In chronological order: * [2019-11-06] optimize AVX512 SGEMM * [2019-11-12] AVX512 CGEMM & ZGEMM kernels * [2019-12-23] optimize AVX2 CGEMM and ZGEMM + * [2019-12-27] AVX2 CGEMM3M kernel From 454847588e700991a24b93d71665d3386385024a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Dec 2019 21:27:18 +0100 Subject: [PATCH 822/935] Update LAPACK to 3.9.0 --- lapack-netlib/SRC/VARIANTS/Makefile | 22 +++++++++++----------- lapack-netlib/SRC/VARIANTS/README | 12 ++++++------ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 9f1410755..25d8ee175 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -1,5 +1,3 @@ -include ../../make.inc - ####################################################################### # This is the makefile to create a the variants libraries for LAPACK. # The files are organized as follows: @@ -17,6 +15,9 @@ include ../../make.inc # 1065-1081. http://dx.doi.org/10.1137/S0895479896297744 ####################################################################### +TOPSRCDIR = ../.. 
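+# TOPSRCDIR anchors make.inc relative to the source root instead of a
+# hard-coded ../../ include. A hypothetical override from the command
+# line (the path shown is illustrative only) would be:
+#   make TOPSRCDIR=/path/to/lapack-netlib all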
+include $(TOPSRCDIR)/make.inc + CHOLRL = cholesky/RL/cpotrf.o cholesky/RL/dpotrf.o cholesky/RL/spotrf.o cholesky/RL/zpotrf.o CHOLTOP = cholesky/TOP/cpotrf.o cholesky/TOP/dpotrf.o cholesky/TOP/spotrf.o cholesky/TOP/zpotrf.o @@ -30,37 +31,36 @@ LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o +.PHONY: all all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a cholrl.a: $(CHOLRL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ choltop.a: $(CHOLTOP) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lucr.a: $(LUCR) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lull.a: $(LULL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lurec.a: $(LUREC) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ qrll.a: $(QRLL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ +.PHONY: clean cleanobj cleanlib clean: cleanobj cleanlib cleanobj: rm -f $(CHOLRL) $(CHOLTOP) $(LUCR) $(LULL) $(LUREC) $(QRLL) cleanlib: rm -f *.a - -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< diff --git a/lapack-netlib/SRC/VARIANTS/README b/lapack-netlib/SRC/VARIANTS/README index 4d301cc6e..ef7626deb 100644 --- a/lapack-netlib/SRC/VARIANTS/README +++ b/lapack-netlib/SRC/VARIANTS/README @@ -34,7 +34,7 @@ References:For a more detailed description please refer to ========= These variants are compiled by default in the build process but they are not tested by default. -The build process creates one new library per variants in the four arithmetics (single real/double real/single complex/double complex). +The build process creates one new library per variants in the four arithmetic (single real/double real/single complex/double complex). The libraries are in the SRC/VARIANTS directory. Corresponding libraries created in SRC/VARIANTS: @@ -64,16 +64,16 @@ You should then see the following files in the TESTING directory: = LINKING YOUR PROGRAM = ======================== -You just need to add the variants methods library in your linking sequence before your lapack libary. +You just need to add the variants methods library in your linking sequence before your lapack library. Here is a quick example for LU Default using LU Right Looking version: - $(FORTRAN) -c myprog.f - $(FORTRAN) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) -c myprog.f + $(FC) $(FFLAGS) $(LDFLAGS) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB) Using LU Left Looking version: - $(FORTRAN) -c myprog.f - $(FORTRAN) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS)/lull.a $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) -c myprog.f + $(FC) $(FFLAGS) $(LDFLAGS) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS)/lull.a $(LAPACKLIB) $(BLASLIB) =========== = SUPPORT = From 3ccf8885acbb3a536c67fa8a23b54ba000507a21 Mon Sep 17 00:00:00 2001 From: w00421467 Date: Mon, 30 Dec 2019 11:45:49 +0800 Subject: [PATCH 823/935] prefetching for dgemm_beta --- kernel/arm64/dgemm_beta.S | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S index 636954695..1ce452212 100644 --- a/kernel/arm64/dgemm_beta.S +++ b/kernel/arm64/dgemm_beta.S @@ -43,7 +43,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
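+// prfm_size (640) is 5 * calc_size (128): each PLDL1KEEP hint added
+// below in this patch pulls in data the loop will scale and store five
+// iterations ahead of the current 128-byte pass. The exact distance is
+// an empirical tuning choice, not an architectural requirement.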
#define betaV0 v11.d[0] #define I x16 -#define size 128 +#define prfm_size 640 +#define calc_size 128 /************************************************************************************** * Macro definitions @@ -119,27 +120,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp q2, q3, [A02] ldp q4, q5, [A03] ldp q6, q7, [A04] - + fmul v0.2d, v0.2d, betaV0 fmul v1.2d, v1.2d, betaV0 - + fmul v2.2d, v2.2d, betaV0 fmul v3.2d, v3.2d, betaV0 - + + prfm PLDL1KEEP, [A01, prfm_size] + fmul v4.2d, v4.2d, betaV0 fmul v5.2d, v5.2d, betaV0 - + + prfm PLDL1KEEP, [A03, prfm_size] + fmul v6.2d, v6.2d, betaV0 fmul v7.2d, v7.2d, betaV0 st1 {v0.2d, v1.2d}, [A01] - add A01, A01, size + add A01, A01, calc_size st1 {v2.2d, v3.2d}, [A02] - add A02, A02, size + add A02, A02, calc_size st1 {v4.2d, v5.2d}, [A03] - add A03, A03, size + add A03, A03, calc_size st1 {v6.2d, v7.2d}, [A04] - add A04, A04, size + add A04, A04, calc_size subs I , I , #1 bne .Lgemm_beta_03 From ae1579be13bb864309c494d64a0696fbbc79d819 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 30 Dec 2019 16:02:51 +0800 Subject: [PATCH 824/935] Create zgemm3m_kernel_4x4_haswell.c --- kernel/x86_64/zgemm3m_kernel_4x4_haswell.c | 212 +++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 kernel/x86_64/zgemm3m_kernel_4x4_haswell.c diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c new file mode 100644 index 000000000..7b5b835c8 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c @@ -0,0 +1,212 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ +/* r12 = k << 5(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */ + +#include "common.h" +#include + +//recommended settings: GEMM_Q=256, GEMM_P=256 + +/* m = 4 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovupd (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm2; vfmadd231pd %%ymm1,%%ymm2,%%ymm4;"\ + "addq $8,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\ + "vbroadcastf128 (%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm4; vfmadd231pd %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vbroadcastf128 16(%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm6; vfmadd231pd %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) 
\ + "vbroadcastf128 ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%1;" +#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_h_m4n1 \ + "vpermpd $216,%%ymm4,%%ymm3; vunpcklpd %%ymm3,%%ymm3,%%ymm1; vunpckhpd %%ymm3,%%ymm3,%%ymm2;"\ + "vfmadd213pd (%2),%%ymm0,%%ymm1; vfmadd213pd 32(%2),%%ymm0,%%ymm2; vmovupd %%ymm1,(%2); vmovupd %%ymm2,32(%2);" +#define unit_save_m4n2(c1,c2) \ + "vperm2f128 $2,"#c1","#c2",%%ymm2; vperm2f128 $19,"#c1","#c2","#c2"; vmovapd %%ymm2,"#c1";"\ + "vunpcklpd "#c1","#c1",%%ymm2; vunpcklpd "#c2","#c2",%%ymm3;"\ + "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd 32(%5),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,32(%5);"\ + "vunpckhpd "#c1","#c1",%%ymm2; vunpckhpd "#c2","#c2",%%ymm3;"\ + "vfmadd213pd (%5,%3,1),%%ymm0,%%ymm2; vfmadd213pd 32(%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5,%3,1); vmovupd %%ymm3,32(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_h_m4n2 "movq %2,%5;" unit_save_m4n2(%%ymm4,%%ymm5) +#define SAVE_h_m4n4 SAVE_h_m4n2 unit_save_m4n2(%%ymm6,%%ymm7) +#define SAVE_h_m4n8 SAVE_h_m4n4 unit_save_m4n2(%%ymm8,%%ymm9) unit_save_m4n2(%%ymm10,%%ymm11) +#define SAVE_h_m4n12 SAVE_h_m4n8 unit_save_m4n2(%%ymm12,%%ymm13) unit_save_m4n2(%%ymm14,%%ymm15) +#define SAVE_m4(ndim) SAVE_h_m4n##ndim "addq $64,%2;" +#define COMPUTE_m4(ndim) \ + INIT_m4n##ndim\ + "movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\ + "cmpq $24,%4; jb "#ndim"004042f;"\ + #ndim"004041:\n\t"\ + "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\ + "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\ + "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "prefetcht1 (%8); addq $32,%8;"\ + "subq $8,%4; cmpq $24,%4; jnb "#ndim"004041b;"\ + "movq %2,%5;"\ + #ndim"004042:\n\t"\ + "testq %4,%4; jz "#ndim"004043f;"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ + KERNEL_k1m4n##ndim\ + "decq %4; jmp "#ndim"004042b;"\ + #ndim"004043:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ + SAVE_m4(ndim) + +/* m = 2 *//* vmm0 for alpha, vmm1-vmm3 for temporary use, vmm4-vmm9 for accumulators */ +#define KERNEL_k1m2n1 \ + "vmovupd (%0),%%xmm1; addq $16,%0;"\ + "vmovddup (%1),%%xmm2; vfmadd231pd %%xmm1,%%xmm2,%%xmm4;"\ + "addq $8,%1;" +#define KERNEL_h_k1m2n2 \ + "vmovddup (%0),%%xmm1; vmovddup 8(%0),%%xmm2; addq $16,%0;"\ + "vmovupd (%1),%%xmm3; vfmadd231pd %%xmm1,%%xmm3,%%xmm4; vfmadd231pd %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" +#define 
unit_kernel_k1m2n4(c1,c2,...) \ + "vmovupd ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";" +#define KERNEL_h_k1m2n4 \ + "vbroadcastsd (%0),%%ymm1; vbroadcastsd 8(%0),%%ymm2; addq $16,%0;"\ + unit_kernel_k1m2n4(%%ymm4,%%ymm5,%1) +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;" +#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 \ + unit_kernel_k1m2n4(%%ymm6,%%ymm7,%1,%%r12,1) +#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $32,%1;" +#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n8 \ + unit_kernel_k1m2n4(%%ymm8,%%ymm9,%1,%%r12,2) +#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $32,%1;" +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define unit_init_m2n4(c1,c2) "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";" +#define INIT_m2n4 unit_init_m2n4(%%ymm4,%%ymm5) +#define INIT_m2n8 INIT_m2n4 unit_init_m2n4(%%ymm6,%%ymm7) +#define INIT_m2n12 INIT_m2n8 unit_init_m2n4(%%ymm8,%%ymm9) +#define SAVE_h_m2n1 \ + "vinsertf128 $1,%%xmm4,%%ymm4,%%ymm4; vpermilpd $12,%%ymm4,%%ymm4; vfmadd213pd (%2),%%ymm0,%%ymm4; vmovupd %%ymm4,(%2);" +#define SAVE_h_m2n2 \ + "vinsertf128 $1,%%xmm5,%%ymm4,%%ymm4; vunpcklpd %%ymm4,%%ymm4,%%ymm1; vunpckhpd %%ymm4,%%ymm4,%%ymm2;"\ + "vfmadd213pd (%2),%%ymm0,%%ymm1; vmovupd %%ymm1,(%2);"\ + "vfmadd213pd (%2,%3,1),%%ymm0,%%ymm2; vmovupd %%ymm2,(%2,%3,1);" +#define unit_save_m2n4(c1,c2) \ + "vperm2f128 $2,"#c1","#c2",%%ymm1; vunpcklpd %%ymm1,%%ymm1,%%ymm2; vunpckhpd %%ymm1,%%ymm1,%%ymm3;"\ + "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd (%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vperm2f128 $19,"#c1","#c2",%%ymm1; vunpcklpd %%ymm1,%%ymm1,%%ymm2; vunpckhpd %%ymm1,%%ymm1,%%ymm3;"\ + "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd (%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_h_m2n4 "movq %2,%5;" unit_save_m2n4(%%ymm4,%%ymm5) +#define SAVE_h_m2n8 SAVE_h_m2n4 unit_save_m2n4(%%ymm6,%%ymm7) +#define SAVE_h_m2n12 SAVE_h_m2n8 unit_save_m2n4(%%ymm8,%%ymm9) +#define SAVE_m2(ndim) SAVE_h_m2n##ndim "addq $32,%2;" +#define COMPUTE_m2(ndim) \ + INIT_m2n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"002022:\n\t"\ + "testq %4,%4; jz "#ndim"002023f;"\ + KERNEL_k1m2n##ndim\ + "decq %4; jmp "#ndim"002022b;"\ + #ndim"002023:\n\t"\ + SAVE_m2(ndim) + +/* m = 1 *//* vmm0 for alpha, vmm1-vmm3 and vmm10-vmm15 for temporary use, vmm4-vmm6 for accumulators */ +#define KERNEL_k1m1n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vfmadd231sd (%1),%%xmm1,%%xmm4; addq $8,%1;" +#define KERNEL_k1m1n2 \ + "vmovddup (%0),%%xmm1; addq $8,%0;"\ + "vfmadd231pd (%1),%%xmm1,%%xmm4; addq $16,%1;" +#define unit_kernel_k1m1n4(c1,...) 
\ + "vmovupd ("#__VA_ARGS__"),%%ymm2; vfmadd231pd %%ymm1,%%ymm2,"#c1";" +#define KERNEL_h_k1m1n4 \ + "vbroadcastsd (%0),%%ymm1; addq $8,%0;"\ + unit_kernel_k1m1n4(%%ymm4,%1) +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;" +#define KERNEL_h_k1m1n8 KERNEL_h_k1m1n4 unit_kernel_k1m1n4(%%ymm5,%1,%%r12,1) +#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $32,%1;" +#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n8 unit_kernel_k1m1n4(%%ymm6,%1,%%r12,2) +#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $32,%1;" +#define INIT_m1n1 INIT_m2n1 +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m1n8 INIT_m1n4 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%ymm6,%%ymm6,%%ymm6;" +#define SAVE_h_m1n1 \ + "vmovddup %%xmm4,%%xmm4; vfmadd213pd (%2),%%xmm0,%%xmm4; vmovupd %%xmm4,(%2);" +#define SAVE_h_m1n2 \ + "vunpcklpd %%xmm4,%%xmm4,%%xmm1; vunpckhpd %%xmm4,%%xmm4,%%xmm2;"\ + "vfmadd213pd (%2),%%xmm0,%%xmm1; vmovupd %%xmm1,(%2);"\ + "vfmadd213pd (%2,%3,1),%%xmm0,%%xmm2; vmovupd %%xmm2,(%2,%3,1);" +#define unit_save_m1n4(c1) \ + "vunpcklpd "#c1","#c1",%%ymm1; vunpckhpd "#c1","#c1",%%ymm2;"\ + "vmovupd (%5),%%xmm3; vinsertf128 $1,(%5,%3,2),%%ymm3,%%ymm3;"\ + "vfmadd213pd %%ymm3,%%ymm0,%%ymm1; vmovupd %%xmm1,(%5); vextractf128 $1,%%ymm1,(%5,%3,2); addq %3,%5;"\ + "vmovupd (%5),%%xmm3; vinsertf128 $1,(%5,%3,2),%%ymm3,%%ymm3;"\ + "vfmadd213pd %%ymm3,%%ymm0,%%ymm2; vmovupd %%xmm2,(%5); vextractf128 $1,%%ymm2,(%5,%3,2); addq %3,%5; leaq (%5,%3,2),%5;" +#define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%ymm4) +#define SAVE_h_m1n8 SAVE_h_m1n4 unit_save_m1n4(%%ymm5) +#define SAVE_h_m1n12 SAVE_h_m1n8 unit_save_m1n4(%%ymm6) +#define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $16,%2;" +#define COMPUTE_m1(ndim) \ + INIT_m1n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"001011:\n\t"\ + "testq %4,%4; jz "#ndim"001012f;"\ + KERNEL_k1m1n##ndim\ + "decq %4; jmp "#ndim"001011b;"\ + #ndim"001012:\n\t"\ + SAVE_m1(ndim) + +#define COMPUTE(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastf128 (%6),%%ymm0;"\ + "movq %4,%%r13; movq %4,%%r12; salq $5,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $4,%7;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m4(ndim)\ + "subq $4,%7;cmpq $4,%7;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $2,%7;jb 33104"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%7;"\ + "33104"#ndim":\n\t"\ + "testq %7,%7;jz 33105"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\ + ::"r11","r12","r13","r14","r15","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14",\ + "ymm15","cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\ +} +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2; + double constval[2]; constval[0] = alphar; constval[1] = alphai; + double *const_val=constval; + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + double *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) 
COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From 109e18cd96707e1fab40b2777bf35e8257f540d2 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 30 Dec 2019 16:03:24 +0800 Subject: [PATCH 825/935] Update KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index bdebd22b9..9e30c12f2 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -98,5 +98,5 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c -ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c From f60840c4207f8ebdfedfd987505013ed3dcbb3d2 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 30 Dec 2019 16:04:23 +0800 Subject: [PATCH 826/935] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index 025db515e..98cd38dfa 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -95,5 +95,5 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c -ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c From 6362c34ee60490a10f11b0adba1c9c84579c682e Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 30 Dec 2019 16:08:19 +0800 Subject: [PATCH 827/935] Update param.h --- param.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/param.h b/param.h index 4084c781d..d03e60fcb 100644 --- a/param.h +++ b/param.h @@ -695,14 +695,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM3M_DEFAULT_UNROLL_N 4 #define CGEMM3M_DEFAULT_UNROLL_M 8 -#define ZGEMM3M_DEFAULT_UNROLL_N 8 -#define ZGEMM3M_DEFAULT_UNROLL_M 2 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 #define CGEMM3M_DEFAULT_P 320 -#define ZGEMM3M_DEFAULT_P 224 +#define ZGEMM3M_DEFAULT_P 256 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 320 -#define ZGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 256 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 @@ -1598,14 +1598,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
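+/* P = Q = 256 matches the "recommended settings" note at the top of  */
+/* zgemm3m_kernel_4x4_haswell.c: a packed 4x256 strip of doubles is   */
+/* 4 * 256 * 8 = 8 KB, so the working strips of A and B together stay */
+/* L1d-resident. (Sizing rationale inferred, not stated in the commit.) */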
#define CGEMM3M_DEFAULT_UNROLL_N 4 #define CGEMM3M_DEFAULT_UNROLL_M 8 -#define ZGEMM3M_DEFAULT_UNROLL_N 8 -#define ZGEMM3M_DEFAULT_UNROLL_M 2 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 #define CGEMM3M_DEFAULT_P 320 -#define ZGEMM3M_DEFAULT_P 224 +#define ZGEMM3M_DEFAULT_P 256 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 320 -#define ZGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 256 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 From aae44d040db343ee2fab98fa600c192cc271e73a Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 30 Dec 2019 16:10:08 +0800 Subject: [PATCH 828/935] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index fd759913d..3d7617f92 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -178,4 +178,4 @@ In chronological order: * [2019-11-06] optimize AVX512 SGEMM * [2019-11-12] AVX512 CGEMM & ZGEMM kernels * [2019-12-23] optimize AVX2 CGEMM and ZGEMM - * [2019-12-27] AVX2 CGEMM3M kernel + * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernel From bb2729c85521310f26ed08792711c2b9c8cf3299 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 30 Dec 2019 16:11:37 +0800 Subject: [PATCH 829/935] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 3d7617f92..9829c31f9 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -178,4 +178,4 @@ In chronological order: * [2019-11-06] optimize AVX512 SGEMM * [2019-11-12] AVX512 CGEMM & ZGEMM kernels * [2019-12-23] optimize AVX2 CGEMM and ZGEMM - * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernel + * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels From 700fe5b5ee4866b9ff791a5952f5e6ae8afa2d04 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 30 Dec 2019 17:18:59 +0800 Subject: [PATCH 830/935] Add files via upload --- kernel/x86_64/zgemm3m_kernel_4x4_haswell.c | 38 ++++++++++++++-------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c index 7b5b835c8..c57dccd36 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c +++ b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c @@ -18,13 +18,26 @@ #define KERNEL_h_k1m4n4 \ KERNEL_h_k1m4n2 "vbroadcastf128 16(%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm6; vfmadd231pd %%ymm2,%%ymm3,%%ymm7;" #define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;" -#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ - "vbroadcastf128 ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\ - "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";" -#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1) +#define unit_kernel_k1m4n4(c1,c2,c3,c4,off1,off2,...) 
\ + "vbroadcastf128 "#off1"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastf128 "#off2"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1) #define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%1;" -#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2) +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2) #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%1;" +#define KERNEL_k2m4n1 KERNEL_k1m4n1 KERNEL_k1m4n1 +#define KERNEL_k2m4n2 KERNEL_k1m4n2 KERNEL_k1m4n2 +#define KERNEL_k2m4n4 KERNEL_k1m4n4 KERNEL_k1m4n4 +#define KERNEL_k2m4n8 KERNEL_k1m4n8 KERNEL_k1m4n8 +#define KERNEL_k2m4n12 \ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;"\ + unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,16,%1)\ + unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1)\ + unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2)\ + "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2; prefetcht0 512(%0); addq $64,%0;"\ + unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,32,48,%1)\ + unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,32,48,%1,%%r12,1)\ + unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,32,48,%1,%%r12,2) "addq $64,%1;" #define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" #define INIT_m4n2 INIT_m4n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" #define INIT_m4n4 INIT_m4n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" @@ -53,18 +66,17 @@ "cmpq $24,%4; jb "#ndim"004042f;"\ #ndim"004041:\n\t"\ "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\ - "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ - "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ - "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\ - "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ - "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ - "prefetcht1 (%8); addq $32,%8;"\ - "subq $8,%4; cmpq $24,%4; jnb "#ndim"004041b;"\ + KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\ + "prefetcht1 (%5); subq $63,%5;"\ + KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\ + "addq %%r15,%5; prefetcht1 (%8); addq $32,%8;"\ + "subq $8,%4; cmpq $16,%4; jnb "#ndim"004041b;"\ "movq %2,%5;"\ #ndim"004042:\n\t"\ "testq %4,%4; jz "#ndim"004043f;"\ - "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ + "prefetcht0 (%5); prefetcht0 63(%5);"\ KERNEL_k1m4n##ndim\ + "prefetcht0 (%5,%3,4); prefetcht0 63(%5,%3,4); addq %3,%5;"\ "decq %4; jmp "#ndim"004042b;"\ #ndim"004043:\n\t"\ "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ From a0f0a802fcb895ae533f167f60667cc808e79b65 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 30 Dec 2019 17:33:42 +0800 Subject: [PATCH 831/935] Update zgemm3m_kernel_4x4_haswell.c --- kernel/x86_64/zgemm3m_kernel_4x4_haswell.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c index c57dccd36..56bc06c5c 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c +++ b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c @@ -201,8 +201,8 @@ "33105"#ndim":\n\t"\ "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\ - 
::"r11","r12","r13","r14","r15","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14",\ - "ymm15","cc","memory");\ + ::"r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14",\ + "xmm15","cc","memory");\ a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\ } int __attribute__ ((noinline)) From 50f7fc1401b77c105c6681b3aefb8ce7555b7831 Mon Sep 17 00:00:00 2001 From: zq Date: Tue, 31 Dec 2019 10:21:23 +0800 Subject: [PATCH 832/935] [WIP] Use arm neon instructions to optimize tcopy operation --- kernel/arm64/KERNEL.ARMV8 | 8 + kernel/arm64/KERNEL.TSV110 | 8 + kernel/arm64/sgemm_tcopy_16.S | 824 ++++++++++++++++++++++++++++++++++ 3 files changed, 840 insertions(+) create mode 100644 kernel/arm64/sgemm_tcopy_16.S diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index b90dd228b..28eff773f 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -108,12 +108,20 @@ SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 index 04d6940d7..8c31f83b1 100644 --- a/kernel/arm64/KERNEL.TSV110 +++ b/kernel/arm64/KERNEL.TSV110 @@ -110,12 +110,20 @@ SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S new file mode 100644 index 000000000..12b80bdca --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -0,0 +1,824 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x18 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + //prfm PSTL1KEEP, [B00, M8] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] + add A05, A05, #64 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] + add A06, A06, #64 + + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] + add A07, A07, #64 + + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] + add A08, A08, #64 + + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 + +.endm + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, 
v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] + add B01, B01, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + add B02, B02, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] + add B02, B02, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + add B03, B03, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B03] + add B03, B03, #16 + stp d6, d7, [B03] + add B03, B03, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + ldr d4, [A05], #8 + ldr d5, [A06], #8 + ldr d6, [A07], #8 + ldr d7, [A08], #8 + + stp s4, s5, [B04] + add B04, B04, #8 + stp s6, s7, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ +.macro COPY16x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, 
v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + + add B00, B00, M8 +.endm + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + + add B02, B02, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + + add B03, B03, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add B00, B00, M8 +.endm + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B02] + add B02, B02, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B03] + add B03, B03, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B04] + + add B04, B04, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY8x1 + prfm PLDL1KEEP, 
[A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B01] + + add B01, B01, #32 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B02] + + add B02, B02, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B03] + + add B03, B03, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B04] + + add B04, B04, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-16 + and B02 , N , #-8 + and B03 , N , #-4 + and B04 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + mul B04, B04, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + add B04 , B04, B + + lsl M8, M, #6 // M8 = M * 16 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #512 // B = B + 8 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M16_40 + + .align 5 +.Lsgemm_tcopy_L8_M16_20: + + COPY16x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M16_20 + +.Lsgemm_tcopy_L8_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L8_M16_60 + + COPY8x8 + +.Lsgemm_tcopy_L8_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L8_M16_80 + + COPY4x8 + +.Lsgemm_tcopy_L8_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M16_100 + + COPY2x8 + +.Lsgemm_tcopy_L8_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M16_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M16_END: + + subs J , J, #1 // j-- + bne .Lsgemm_tcopy_L8_M16_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov B00, B + add B, B00, #256 // B = B + 4 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M16_40 + + .align 5 +.Lsgemm_tcopy_L4_M16_20: + + COPY16x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M16_20 + +.Lsgemm_tcopy_L4_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L4_M16_60 + + COPY8x4 + +.Lsgemm_tcopy_L4_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L4_M16_80 + + COPY4x4 + +.Lsgemm_tcopy_L4_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M16_100 + + COPY2x4 + + +.Lsgemm_tcopy_L4_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L4_M16_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #128 // B = B + 2 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M16_40 + + .align 5 +.Lsgemm_tcopy_L2_M16_20: + + COPY16x2 + + subs I , I , #1 + bne 
.Lsgemm_tcopy_L2_M16_20 + +.Lsgemm_tcopy_L2_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L2_M16_60 + + COPY8x2 + +.Lsgemm_tcopy_L2_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L2_M16_80 + + COPY4x2 + +.Lsgemm_tcopy_L2_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M16_100 + + COPY2x2 + +.Lsgemm_tcopy_L2_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M16_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #4 // I = M / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M16_40 + + .align 5 +.Lsgemm_tcopy_L1_M16_20: + + COPY16x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M16_20 + +.Lsgemm_tcopy_L1_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L1_M16_60 + + COPY8x1 + +.Lsgemm_tcopy_L1_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L1_M16_80 + + COPY4x1 + +.Lsgemm_tcopy_L1_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M16_100 + + COPY2x1 + +.Lsgemm_tcopy_L1_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M16_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M16_END: + +.Lsgemm_tcopy_L999: + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE + + From 0833a4846ac77ee21a6e2100a51b4e31f3d0b9c7 Mon Sep 17 00:00:00 2001 From: w00421467 Date: Tue, 31 Dec 2019 10:31:07 +0800 Subject: [PATCH 833/935] Use arm neon instructions to optimize sgemm_beta operation --- kernel/arm64/KERNEL.ARMV8 | 1 + kernel/arm64/sgemm_beta.S | 259 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100755 kernel/arm64/sgemm_beta.S diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index b90dd228b..587ee25c6 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -103,6 +103,7 @@ ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S new file mode 100755 index 000000000..a3b97e231 --- /dev/null +++ b/kernel/arm64/sgemm_beta.S @@ -0,0 +1,259 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define BETA s0 +#define LDC x6 +#define C00 x7 + +#define A01 x8 +#define A02 x9 +#define A03 x10 +#define A04 x11 +#define I x12 + +#define beta0 s11 +#define betaV0 v11.s[0] + +#define prfm_size 640 +#define calc_size 128 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro INIT_ZERO + fmul v0.4s, v0.4s, betaV0 + fmul v1.4s, v1.4s, betaV0 + fmul v2.4s, v2.4s, betaV0 + fmul v3.4s, v3.4s, betaV0 + fmul v4.4s, v4.4s, betaV0 + fmul v5.4s, v5.4s, betaV0 + fmul v6.4s, v6.4s, betaV0 + fmul v7.4s, v7.4s, betaV0 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + ldr LDC, [sp] + SAVE_REGS + +.Lgemm_beta_BEGIN: + + fmov beta0, BETA + cmp N, #0 + ble .Lgemm_beta_L999 + + fcmp BETA, #0.0 + beq .Lgemm_beta_zero_01 + +.Lgemm_beta_01: + + lsl LDC, LDC, #2 + + .align 5 +.Lgemm_beta_02: + + mov A01, C00 + add C00, C00, LDC + asr I, M, #5 + cmp I, #0 + ble .Lgemm_beta_04 + add A02, A01, #32 + add A03, A02, #32 + add A04, A03, #32 + + .align 5 +.Lgemm_beta_03: + + prfm PLDL1KEEP, [A01, prfm_size] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + ldp q4, q5, [A03] + ldp q6, q7, [A04] + + fmul v0.4s, v0.4s, betaV0 + fmul v1.4s, v1.4s, betaV0 + + fmul v2.4s, v2.4s, betaV0 + fmul v3.4s, v3.4s, betaV0 + + fmul v4.4s, v4.4s, betaV0 + fmul v5.4s, v5.4s, betaV0 + + fmul v6.4s, v6.4s, betaV0 + fmul v7.4s, v7.4s, betaV0 + + prfm PLDL1KEEP, [A01, prfm_size + 64] + + st1 {v0.4s, v1.4s}, [A01] + add A01, A01, calc_size + st1 {v2.4s, v3.4s}, [A02] + add A02, A02, calc_size + st1 {v4.4s, v5.4s}, [A03] + add A03, A03, calc_size + st1 {v6.4s, v7.4s}, [A04] + add A04, A04, calc_size + + subs I , I , #1 + bne .Lgemm_beta_03 + + .align 5 +.Lgemm_beta_04: + + and I, M , #31 + cmp I, #0 + ble .Lgemm_beta_06 + + .align 5 +.Lgemm_beta_05: + + ldr 
s12, [A01] + fmul s12, s12, beta0 + str s12, [A01] + add A01, A01, #4 + + subs I , I , #1 + bne .Lgemm_beta_05 + + .align 5 +.Lgemm_beta_06: + + subs N , N, #1 // N-- + bne .Lgemm_beta_02 + + .align 5 +.Lgemm_beta_L999: + + mov x0, #0 + RESTORE_REGS + ret + + .align 5 +.Lgemm_beta_zero_01: + + INIT_ZERO + lsl LDC, LDC, #2 + + .align 5 +.Lgemm_beta_zero_02: + + mov A01, C00 + add C00, C00, LDC + + asr I, M, #5 + cmp I, #0 + ble .Lgemm_beta_zero_04 + add A02, A01, #32 + add A03, A02, #32 + add A04, A03, #32 + + .align 5 +.Lgemm_beta_zero_03: + + st1 {v0.4s, v1.4s}, [A01] + add A01, A01, calc_size + st1 {v2.4s, v3.4s}, [A02] + add A02, A02, calc_size + st1 {v4.4s, v5.4s}, [A03] + add A03, A03, calc_size + st1 {v6.4s, v7.4s}, [A04] + add A04, A04, calc_size + + subs I, I, #1 + bne .Lgemm_beta_zero_03 + + .align 5 +.Lgemm_beta_zero_04: + + and I, M, #31 + cmp I, #0 + ble .Lgemm_beta_zero_06 + + .align 5 +.Lgemm_beta_zero_05: + + str beta0, [A01] + add A01, A01, #4 + + subs I, I, #1 + bne .Lgemm_beta_zero_05 + + .align 5 +.Lgemm_beta_zero_06: + + subs N, N, #1 + bne .Lgemm_beta_zero_02 + + .align 5 +.Lgemm_beta_zero_L999: + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE From 8729db117ced2094d688b38f85fc2cf1faf7fde7 Mon Sep 17 00:00:00 2001 From: shengyang Date: Tue, 31 Dec 2019 15:59:52 +0800 Subject: [PATCH 834/935] modified: ctest/din3 modified: ctest/sin3 --- ctest/din3 | 2 +- ctest/sin3 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ctest/din3 b/ctest/din3 index 23fedfe32..9919774ac 100644 --- a/ctest/din3 +++ b/ctest/din3 @@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO -6 NUMBER OF VALUES OF N +7 NUMBER OF VALUES OF N 1 2 3 5 7 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA diff --git a/ctest/sin3 b/ctest/sin3 index 644083f22..b74206b70 100644 --- a/ctest/sin3 +++ b/ctest/sin3 @@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO -6 NUMBER OF VALUES OF N +7 NUMBER OF VALUES OF N 0 1 2 3 5 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA From 8d844032056bc3533a396c16f6386df2b9173eb4 Mon Sep 17 00:00:00 2001 From: shengyang Date: Tue, 31 Dec 2019 17:06:35 +0800 Subject: [PATCH 835/935] Use arm neon instructions to optimize ncopy operation modified: KERNEL.ARMV8 modified: KERNEL.TSV110 new file: sgemm_ncopy_4.S --- kernel/arm64/KERNEL.ARMV8 | 9 + kernel/arm64/KERNEL.TSV110 | 9 + kernel/arm64/sgemm_ncopy_4.S | 333 +++++++++++++++++++++++++++++++++++ 3 files changed, 351 insertions(+) create mode 100644 kernel/arm64/sgemm_ncopy_4.S diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index b90dd228b..e73bed76e 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -107,12 +107,21 @@ DGEMM_BETA = dgemm_beta.S SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif + +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 index 04d6940d7..0db068dcf 100644 --- a/kernel/arm64/KERNEL.TSV110 +++ b/kernel/arm64/KERNEL.TSV110 @@ -109,12 +109,21 @@ ZGEMVTKERNEL = zgemv_t.S SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif + +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S new file mode 100644 index 000000000..30450cc7d --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -0,0 +1,333 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	M	x0
+#define	N	x1
+#define	A00	x2
+#define	LDA	x3
+#define	B00	x4
+
+#define	A01	x5
+#define	A02	x6
+#define	A03	x7
+#define	A04	x8
+
+#define	I	x9
+#define	J	x10
+
+#define	TEMP1	x11
+#define	TEMP2	x12
+
+#define A_PREFETCH	2560
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro SAVE_REGS
+	add	sp, sp, #-(11 * 16)
+	stp	d8, d9, [sp, #(0 * 16)]
+	stp	d10, d11, [sp, #(1 * 16)]
+	stp	d12, d13, [sp, #(2 * 16)]
+	stp	d14, d15, [sp, #(3 * 16)]
+	stp	d16, d17, [sp, #(4 * 16)]
+	stp	x18, x19, [sp, #(5 * 16)]
+	stp	x20, x21, [sp, #(6 * 16)]
+	stp	x22, x23, [sp, #(7 * 16)]
+	stp	x24, x25, [sp, #(8 * 16)]
+	stp	x26, x27, [sp, #(9 * 16)]
+	str	x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+	ldp	d8, d9, [sp, #(0 * 16)]
+	ldp	d10, d11, [sp, #(1 * 16)]
+	ldp	d12, d13, [sp, #(2 * 16)]
+	ldp	d14, d15, [sp, #(3 * 16)]
+	ldp	d16, d17, [sp, #(4 * 16)]
+	ldp	x18, x19, [sp, #(5 * 16)]
+	ldp	x20, x21, [sp, #(6 * 16)]
+	ldp	x22, x23, [sp, #(7 * 16)]
+	ldp	x24, x25, [sp, #(8 * 16)]
+	ldp	x26, x27, [sp, #(9 * 16)]
+	ldr	x28, [sp, #(10 * 16)]
+	add	sp, sp, #(11*16)
+.endm
+
+.macro COPY4x4
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	q0, [A01], #16
+	ins	v8.s[0], v0.s[0]
+	ins	v9.s[0], v0.s[1]
+	ins	v10.s[0], v0.s[2]
+	ins	v11.s[0], v0.s[3]
+
+	ldr	q1, [A02], #16
+	ins	v8.s[1], v1.s[0]
+	ins	v9.s[1], v1.s[1]
+	ins	v10.s[1], v1.s[2]
+	ins	v11.s[1], v1.s[3]
+
+	ldr	q2, [A03], #16
+	ins	v8.s[2], v2.s[0]
+	ins	v9.s[2], v2.s[1]
+	ins	v10.s[2], v2.s[2]
+	ins	v11.s[2], v2.s[3]
+
+	ldr	q3, [A04], #16
+	ins	v8.s[3], v3.s[0]
+	ins	v9.s[3], v3.s[1]
+	ins	v10.s[3], v3.s[2]
+	ins	v11.s[3], v3.s[3]
+
+	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B00]
+	add	B00, B00, #64
+
+.endm
+
+.macro COPY1x4
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	s0, [A01], #4
+	ldr	s1, [A02], #4
+	ldr	s2, [A03], #4
+	ldr	s3, [A04], #4
+
+	stp	s0, s1, [B00]
+	add	B00, B00, #8
+	stp	s2, s3, [B00]
+	add	B00, B00, #8
+.endm
+
+.macro COPY4x2
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	q0, [A01], #16
+	ins	v8.s[0], v0.s[0]
+	ins	v9.s[0], v0.s[1]
+	ins	v10.s[0], v0.s[2]
+	
ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] + add B00, B00, #32 +.endm + + +.macro COPY1x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + + stp s0, s1, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01], #16 + str q0, [B00], #16 +.endm + + +.macro COPY1x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Ldgemm_ncopy_L4_BEGIN: + + asr J, N, #2 // J = N / 4 + cmp J, #0 + ble .Ldgemm_ncopy_L2_BEGIN + + .align 5 +.Ldgemm_ncopy_L4_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_40 + + .align 5 +.Ldgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_20 + +.Ldgemm_ncopy_L4_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_END + + .align 5 +.Ldgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_60 + +.Ldgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne .Ldgemm_ncopy_L4_M4_BEGIN + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble .Ldgemm_ncopy_L999 + + tst N, #2 + ble .Ldgemm_ncopy_L1_BEGIN + +.Ldgemm_ncopy_L2_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_40 + + .align 5 +.Ldgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_20 + +.Ldgemm_ncopy_L2_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_END + + .align 5 +.Ldgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_60 + +.Ldgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Ldgemm_ncopy_L999 + +.Ldgemm_ncopy_L1_M4_BEGIN: + + mov A01, A00 + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_40 + + .align 5 +.Ldgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_20 + + +.Ldgemm_ncopy_L1_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_END + + .align 5 +.Ldgemm_ncopy_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_60 + + +.Ldgemm_ncopy_L1_M4_END: + +.Ldgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE + From 96ad5794284b54331afe0db07b6471913d96f2a9 Mon Sep 17 00:00:00 2001 From: int_13h <30789322+nk521@users.noreply.github.com> Date: Tue, 31 Dec 2019 22:33:27 +0530 Subject: [PATCH 836/935] add in runtime cpu detection for zarch (#2349) add in runtime cpu detection for zarch --- Makefile.system | 7 ++ driver/others/Makefile | 8 ++ driver/others/dynamic_zarch.c | 131 ++++++++++++++++++++++++++++++ kernel/setparam-ref.c | 21 +++++ kernel/zarch/KERNEL.Z13 | 20 ++--- kernel/zarch/KERNEL.Z14 | 20 ++--- kernel/zarch/KERNEL.ZARCH_GENERIC | 16 ++-- 7 files changed, 195 insertions(+), 28 deletions(-) create mode 100644 
driver/others/dynamic_zarch.c diff --git a/Makefile.system b/Makefile.system index ab2ffca52..c0e45515f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -25,6 +25,8 @@ else ifeq ($(ARCH), i386) override ARCH=x86 else ifeq ($(ARCH), aarch64) override ARCH=arm64 +else ifeq ($(ARCH), zarch) +override ARCH=zarch endif NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib @@ -558,6 +560,11 @@ DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 endif +ifeq ($(ARCH), zarch) +DYNAMIC_CORE = Z13 +DYNAMIC_CORE += Z14 +endif + ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 diff --git a/driver/others/Makefile b/driver/others/Makefile index d4b5c26d5..5653f3c25 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -21,9 +21,13 @@ else ifeq ($(ARCH),power) COMMONOBJS += dynamic_power.$(SUFFIX) else +ifeq ($(ARCH),zarch) +COMMONOBJS += dynamic_zarch.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -85,9 +89,13 @@ else ifeq ($(ARCH),power) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX) else +ifeq ($(ARCH),zarch) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c new file mode 100644 index 000000000..1206bf870 --- /dev/null +++ b/driver/others/dynamic_zarch.c @@ -0,0 +1,131 @@ + +#include "common.h" + +extern gotoblas_t gotoblas_Z13; +extern gotoblas_t gotoblas_Z14; +extern gotoblas_t gotoblas_Z15; +//#if (!defined C_GCC) || (GCC_VERSION >= 60000) +//extern gotoblas_t gotoblas_Z14; +//#endif + +#define NUM_CORETYPES 5 + +extern void openblas_warning(int verbose, const char* msg); + +static char* corename[] = { + "unknown", + "Z13", + "Z14", + "Z15", + "ZARCH_GENERIC", +}; + +char* gotoblas_corename(void) { + if (gotoblas == &gotoblas_Z13) return corename[1]; + if (gotoblas == &gotoblas_Z14) return corename[2]; + if (gotoblas == &gotoblas_Z15) return corename[3]; +//#if (!defined C_GCC) || (GCC_VERSION >= 60000) +// if (gotoblas == &gotoblas_POWER9) return corename[3]; +//#endif + return corename[0]; // try generic? 
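+	// No core-table pointer matched above: report "unknown" instead of
+	// failing here. A NULL result from get_coretype() below is handled
+	// separately in gotoblas_dynamic_init(), which warns and falls back
+	// to the Z14 kernels.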
+}
+
+// __builtin_cpu_is is not supported by zarch
+static gotoblas_t* get_coretype(void) {
+	FILE* infile;
+	char buffer[512], * p;
+
+	p = (char*)NULL;
+	infile = fopen("/proc/sysinfo", "r");
+	while (fgets(buffer, sizeof(buffer), infile)) {
+		if (!strncmp("Type", buffer, 4)) {
+			p = strchr(buffer, ':') + 2;
+#if 0
+			fprintf(stderr, "%s\n", p);
+#endif
+			break;
+		}
+	}
+
+	fclose(infile);
+
+	if (strstr(p, "2964")) return &gotoblas_Z13;
+	if (strstr(p, "2965")) return &gotoblas_Z13;
+	if (strstr(p, "3906")) return &gotoblas_Z14;
+	if (strstr(p, "3907")) return &gotoblas_Z14;
+	if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
+	if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
+
+	return NULL; // should be ZARCH_GENERIC
+}
+
+static gotoblas_t* force_coretype(char* coretype) {
+
+	int i;
+	int found = -1;
+	char message[128];
+
+	for (i = 0; i < NUM_CORETYPES; i++)
+	{
+		if (!strncasecmp(coretype, corename[i], 20))
+		{
+			found = i;
+			break;
+		}
+	}
+
+	switch (found)
+	{
+	case 1: return (&gotoblas_Z13);
+	case 2: return (&gotoblas_Z14);
+	case 3: return (&gotoblas_Z15);
+//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
+//	case 3: return (&gotoblas_POWER9);
+//#endif
+	default: return NULL;
+	}
+	snprintf(message, 128, "Core not found: %s\n", coretype);
+	openblas_warning(1, message);
+}
+
+void gotoblas_dynamic_init(void) {
+
+	char coremsg[128];
+	char coren[22];
+	char* p;
+
+
+	if (gotoblas) return;
+
+	p = getenv("OPENBLAS_CORETYPE");
+	if (p)
+	{
+		gotoblas = force_coretype(p);
+	}
+	else
+	{
+		gotoblas = get_coretype();
+	}
+
+	if (gotoblas == NULL)
+	{
+		snprintf(coremsg, 128, "Falling back to Z14 core\n");
+		openblas_warning(1, coremsg);
+		gotoblas = &gotoblas_Z14;
+	}
+
+	if (gotoblas && gotoblas->init) {
+		strncpy(coren, gotoblas_corename(), 20);
+		sprintf(coremsg, "Core: %s\n", coren);
+		openblas_warning(2, coremsg);
+		gotoblas->init();
+	}
+	else {
+		openblas_warning(0, "OpenBLAS : Architecture Initialization failed.
No initialization function found.\n"); + exit(1); + } +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 8e8214e70..3c71c778e 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -739,6 +739,26 @@ static void init_parameter(void) { } #else //POWER +#if defined(ARCH_ZARCH) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +} +#else //ZARCH + #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1325,4 +1345,5 @@ static void init_parameter(void) { } #endif //POWER +#endif //ZARCH #endif //defined(ARCH_ARM64) diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index b1ffd3c54..3bcc32197 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -96,10 +96,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -108,16 +108,16 @@ DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ctrmm4x4V.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ztrmm4x4V.S ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index 971896c2d..f6e3bec23 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -96,10 +96,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -108,16 +108,16 @@ DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = 
dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ctrmm4x4V.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ztrmm4x4V.S ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c diff --git a/kernel/zarch/KERNEL.ZARCH_GENERIC b/kernel/zarch/KERNEL.ZARCH_GENERIC index 3bbeb9155..33850d0f7 100644 --- a/kernel/zarch/KERNEL.ZARCH_GENERIC +++ b/kernel/zarch/KERNEL.ZARCH_GENERIC @@ -94,26 +94,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c From 375b1875c8c1d1d59d1bb6f2c227e6da12563faf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Jan 2020 13:18:53 +0100 Subject: [PATCH 837/935] [WIP] Update LAPACK to 3.9.0 (#2353) * Update make.inc entries for LAPACK 3.9.0 Reference-LAPACK PR 347 changed some variable names and relative paths * Update LAPACK to 3.9.0 * Add new functions from LAPACK 3.9.0 * Add new functions from LAPACK 3.9.0 * Restore LOADER command as it makes it easier to specify pthread as needed * Restore LOADER * Restore EIG/LIN prefixes in cmdbase * add binary path to lapack_testing.py call * Restore OpenMP version check * Restore OpenMP version check * Restore fix for out-of-bounds array accesses from #2096 --- Makefile | 20 +- cmake/lapack.cmake | 16 +- cmake/lapacke.cmake | 6 + exports/gensymbol | 31 +- lapack-netlib/.appveyor.yml | 38 + lapack-netlib/.gitignore | 6 + lapack-netlib/.travis.yml | 49 +- lapack-netlib/BLAS/CMakeLists.txt | 1 + lapack-netlib/BLAS/Makefile | 7 +- lapack-netlib/BLAS/SRC/Makefile | 21 +- lapack-netlib/BLAS/SRC/icamax.f | 2 +- lapack-netlib/BLAS/SRC/idamax.f | 2 +- lapack-netlib/BLAS/SRC/izamax.f | 2 +- lapack-netlib/BLAS/SRC/meson.build | 29 + lapack-netlib/BLAS/SRC/sdsdot.f | 146 +- lapack-netlib/BLAS/TESTING/Makefile | 33 +- lapack-netlib/BLAS/TESTING/cblat1.f | 2 +- lapack-netlib/BLAS/TESTING/dblat1.f | 2 +- lapack-netlib/BLAS/TESTING/sblat1.f | 2 +- 
lapack-netlib/BLAS/TESTING/zblat1.f | 2 +- lapack-netlib/CBLAS/CMakeLists.txt | 19 +- lapack-netlib/CBLAS/Makefile | 10 +- lapack-netlib/CBLAS/examples/Makefile | 16 +- lapack-netlib/CBLAS/examples/cblas_example1.c | 2 +- lapack-netlib/CBLAS/src/Makefile | 50 +- lapack-netlib/CBLAS/src/cblas_sgemm.c | 2 +- lapack-netlib/CBLAS/testing/Makefile | 40 +- lapack-netlib/CBLAS/testing/c_cblat1.f | 2 +- lapack-netlib/CBLAS/testing/c_dblat1.f | 2 +- lapack-netlib/CBLAS/testing/c_sblat1.f | 2 +- lapack-netlib/CBLAS/testing/c_zblat1.f | 2 +- .../CMAKE/CheckLAPACKCompilerFlags.cmake | 2 +- lapack-netlib/CMAKE/FindGcov.cmake | 2 +- lapack-netlib/CMAKE/Findcodecov.cmake | 2 +- lapack-netlib/CMAKE/FortranMangling.cmake | 2 +- .../CMAKE/lapack-config-build.cmake.in | 4 + .../CMAKE/lapack-config-install.cmake.in | 4 + lapack-netlib/CMakeLists.txt | 99 +- lapack-netlib/DOCS/Doxyfile | 2 +- lapack-netlib/DOCS/Doxyfile_man | 2 +- lapack-netlib/DOCS/lawn81.tex | 90 +- lapack-netlib/INSTALL/Makefile | 28 +- lapack-netlib/INSTALL/dlamch.f | 5 + lapack-netlib/INSTALL/dlamchf77.f | 4 + lapack-netlib/INSTALL/ilaver.f | 10 +- lapack-netlib/INSTALL/make.inc.ALPHA | 34 +- lapack-netlib/INSTALL/make.inc.HPPA | 34 +- lapack-netlib/INSTALL/make.inc.IRIX64 | 39 +- lapack-netlib/INSTALL/make.inc.O2K | 39 +- lapack-netlib/INSTALL/make.inc.SGI5 | 34 +- lapack-netlib/INSTALL/make.inc.SUN4 | 34 +- lapack-netlib/INSTALL/make.inc.SUN4SOL2 | 41 +- lapack-netlib/INSTALL/make.inc.XLF | 34 +- lapack-netlib/INSTALL/make.inc.gfortran | 34 +- lapack-netlib/INSTALL/make.inc.gfortran_debug | 34 +- lapack-netlib/INSTALL/make.inc.ifort | 34 +- lapack-netlib/INSTALL/make.inc.pgf95 | 34 +- lapack-netlib/INSTALL/make.inc.pghpf | 34 +- lapack-netlib/INSTALL/slamch.f | 1 + lapack-netlib/LAPACKE/CMakeLists.txt | 51 +- lapack-netlib/LAPACKE/Makefile | 14 +- .../cmake/lapacke-config-build.cmake.in | 5 +- .../cmake/lapacke-config-install.cmake.in | 5 +- lapack-netlib/LAPACKE/example/Makefile | 22 +- lapack-netlib/LAPACKE/include/CMakeLists.txt | 2 +- lapack-netlib/LAPACKE/include/lapack.h | 13715 ++++++++++++++++ lapack-netlib/LAPACKE/include/lapacke.h | 7025 +------- lapack-netlib/LAPACKE/src/CMakeLists.txt | 315 +- lapack-netlib/LAPACKE/src/Makefile | 355 +- lapack-netlib/LAPACKE/src/lapacke_cgejsv.c | 3 - lapack-netlib/LAPACKE/src/lapacke_cgelsd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c | 106 + .../LAPACKE/src/lapacke_cgesvdq_work.c | 149 + lapack-netlib/LAPACKE/src/lapacke_cggesx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chbevd.c | 2 +- .../LAPACKE/src/lapacke_chbevd_2stage.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chbgvd.c | 2 +- .../LAPACKE/src/lapacke_cheev_work.c | 4 +- lapack-netlib/LAPACKE/src/lapacke_cheevd.c | 4 +- .../LAPACKE/src/lapacke_cheevd_2stage.c | 4 +- .../LAPACKE/src/lapacke_cheevd_2stage_work.c | 4 +- .../LAPACKE/src/lapacke_cheevd_work.c | 5 +- lapack-netlib/LAPACKE/src/lapacke_cheevr.c | 2 +- .../LAPACKE/src/lapacke_cheevr_2stage.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chegst.c | 2 +- .../LAPACKE/src/lapacke_chegst_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chegvd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chpevd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chpgvd.c | 2 +- .../LAPACKE/src/lapacke_clantr_work.c | 19 +- lapack-netlib/LAPACKE/src/lapacke_cstedc.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cstegr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cstemr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_csytrs2.c | 2 +- .../LAPACKE/src/lapacke_csytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ctgsen.c | 
2 +- lapack-netlib/LAPACKE/src/lapacke_ctprfb.c | 35 +- lapack-netlib/LAPACKE/src/lapacke_cunmhr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dgeesx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dgejsv.c | 3 - lapack-netlib/LAPACKE/src/lapacke_dgelsd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c | 106 + .../LAPACKE/src/lapacke_dgesvdq_work.c | 149 + lapack-netlib/LAPACKE/src/lapacke_dggesx.c | 2 +- .../LAPACKE/src/lapacke_dlantr_work.c | 19 +- lapack-netlib/LAPACKE/src/lapacke_dormhr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsbevd.c | 2 +- .../LAPACKE/src/lapacke_dsbevd_2stage.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dspevd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dspgvd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dstedc.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dstegr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dstemr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dstevd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dstevr.c | 2 +- .../LAPACKE/src/lapacke_dsyev_work.c | 4 +- lapack-netlib/LAPACKE/src/lapacke_dsyevd.c | 4 +- .../LAPACKE/src/lapacke_dsyevd_2stage.c | 4 +- .../LAPACKE/src/lapacke_dsyevd_2stage_work.c | 4 +- .../LAPACKE/src/lapacke_dsyevd_work.c | 4 +- lapack-netlib/LAPACKE/src/lapacke_dsyevr.c | 2 +- .../LAPACKE/src/lapacke_dsyevr_2stage.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsygvd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c | 2 +- .../LAPACKE/src/lapacke_dsytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dtgsen.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dtprfb.c | 37 +- lapack-netlib/LAPACKE/src/lapacke_dtrsen.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sgeesx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sgejsv.c | 3 - lapack-netlib/LAPACKE/src/lapacke_sgelsd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c | 106 + .../LAPACKE/src/lapacke_sgesvdq_work.c | 148 + lapack-netlib/LAPACKE/src/lapacke_sggesx.c | 2 +- .../LAPACKE/src/lapacke_slantr_work.c | 19 +- lapack-netlib/LAPACKE/src/lapacke_sormhr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssbevd.c | 2 +- .../LAPACKE/src/lapacke_ssbevd_2stage.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sspevd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sspgvd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sstedc.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sstegr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sstemr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sstevd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sstevr.c | 2 +- .../LAPACKE/src/lapacke_ssyev_work.c | 4 +- lapack-netlib/LAPACKE/src/lapacke_ssyevd.c | 4 +- .../LAPACKE/src/lapacke_ssyevd_2stage.c | 4 +- .../LAPACKE/src/lapacke_ssyevd_2stage_work.c | 4 +- .../LAPACKE/src/lapacke_ssyevd_work.c | 4 +- lapack-netlib/LAPACKE/src/lapacke_ssyevr.c | 2 +- .../LAPACKE/src/lapacke_ssyevr_2stage.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssygvd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c | 2 +- .../LAPACKE/src/lapacke_ssytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_stgsen.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_stprfb.c | 37 +- lapack-netlib/LAPACKE/src/lapacke_strsen.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zgejsv.c | 3 - lapack-netlib/LAPACKE/src/lapacke_zgelsd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c | 106 + .../LAPACKE/src/lapacke_zgesvdq_work.c | 149 + lapack-netlib/LAPACKE/src/lapacke_zggesx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhbevd.c | 2 +- .../LAPACKE/src/lapacke_zhbevd_2stage.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c | 2 +- 
.../LAPACKE/src/lapacke_zheev_work.c | 4 +- lapack-netlib/LAPACKE/src/lapacke_zheevd.c | 4 +- .../LAPACKE/src/lapacke_zheevd_2stage.c | 4 +- .../LAPACKE/src/lapacke_zheevd_2stage_work.c | 4 +- .../LAPACKE/src/lapacke_zheevd_work.c | 4 +- lapack-netlib/LAPACKE/src/lapacke_zheevr.c | 2 +- .../LAPACKE/src/lapacke_zheevr_2stage.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhegst.c | 2 +- .../LAPACKE/src/lapacke_zhegst_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhegvd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhpevd.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c | 2 +- .../LAPACKE/src/lapacke_zlantr_work.c | 19 +- lapack-netlib/LAPACKE/src/lapacke_zstedc.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zstegr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zstemr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c | 2 +- .../LAPACKE/src/lapacke_zsytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ztgsen.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ztprfb.c | 38 +- lapack-netlib/LAPACKE/src/lapacke_zunmhr.c | 2 +- lapack-netlib/LAPACKE/utils/Makefile | 17 +- .../LAPACKE/utils/lapacke_chp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_cpf_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_cpp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_csp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_ctp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_dpf_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_dpp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_dsp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_dtp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_spf_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_spp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_ssp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_stp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_zhp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_zpf_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_zpp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_zsp_nancheck.c | 2 +- .../LAPACKE/utils/lapacke_ztp_nancheck.c | 2 +- lapack-netlib/Makefile | 49 +- lapack-netlib/README.md | 13 +- lapack-netlib/SRC/CMakeLists.txt | 26 +- lapack-netlib/SRC/Makefile | 91 +- lapack-netlib/SRC/VARIANTS/Makefile | 22 +- lapack-netlib/SRC/VARIANTS/README | 12 +- lapack-netlib/SRC/cgbrfsx.f | 12 +- lapack-netlib/SRC/cgbsvxx.f | 10 +- lapack-netlib/SRC/cgebak.f | 8 +- lapack-netlib/SRC/cgeev.f | 2 +- lapack-netlib/SRC/cgejsv.f | 66 +- lapack-netlib/SRC/cgelq.f | 19 +- lapack-netlib/SRC/cgelq2.f | 18 +- lapack-netlib/SRC/cgelqf.f | 16 +- lapack-netlib/SRC/cgelqt.f | 1 + lapack-netlib/SRC/cgelqt3.f | 2 + lapack-netlib/SRC/cgemlq.f | 3 +- lapack-netlib/SRC/cgemlqt.f | 2 + lapack-netlib/SRC/cgemqr.f | 3 +- lapack-netlib/SRC/cgeqr.f | 20 +- lapack-netlib/SRC/cgeqr2.f | 19 +- lapack-netlib/SRC/cgeqr2p.f | 20 +- lapack-netlib/SRC/cgeqrf.f | 17 +- lapack-netlib/SRC/cgeqrfp.f | 20 +- lapack-netlib/SRC/cgerfsx.f | 12 +- lapack-netlib/SRC/cgesc2.f | 2 +- lapack-netlib/SRC/cgesvdq.f | 1391 ++ lapack-netlib/SRC/cgesvj.f | 46 +- lapack-netlib/SRC/cgesvxx.f | 10 +- lapack-netlib/SRC/cgetsls.f | 2 + lapack-netlib/SRC/cggesx.f | 8 +- lapack-netlib/SRC/cgsvj0.f | 18 +- lapack-netlib/SRC/cgsvj1.f | 20 +- lapack-netlib/SRC/chb2st_kernels.f | 70 +- lapack-netlib/SRC/checon_3.f | 9 +- lapack-netlib/SRC/cheevr.f | 2 +- lapack-netlib/SRC/cheevr_2stage.f | 2 +- lapack-netlib/SRC/chegs2.f | 1 + lapack-netlib/SRC/chegst.f | 1 + lapack-netlib/SRC/cherfsx.f | 12 +- lapack-netlib/SRC/chesv_aa.f | 6 +- lapack-netlib/SRC/chesv_aa_2stage.f | 4 +- lapack-netlib/SRC/chesvxx.f | 16 +- lapack-netlib/SRC/chetf2_rk.f | 4 +- lapack-netlib/SRC/chetrd_2stage.f | 13 +- 
lapack-netlib/SRC/chetrd_hb2st.F | 4 +- lapack-netlib/SRC/chetrd_he2hb.f | 2 +- lapack-netlib/SRC/chetrf_aa.f | 8 +- lapack-netlib/SRC/chetrf_aa_2stage.f | 34 +- lapack-netlib/SRC/chetri2.f | 4 +- lapack-netlib/SRC/chetrs_aa.f | 132 +- lapack-netlib/SRC/chetrs_aa_2stage.f | 10 +- lapack-netlib/SRC/chseqr.f | 44 +- lapack-netlib/SRC/cla_gbrcond_c.f | 4 +- lapack-netlib/SRC/cla_gbrcond_x.f | 4 +- lapack-netlib/SRC/cla_gbrfsx_extended.f | 12 +- lapack-netlib/SRC/cla_gercond_c.f | 8 +- lapack-netlib/SRC/cla_gercond_x.f | 4 +- lapack-netlib/SRC/cla_gerfsx_extended.f | 12 +- lapack-netlib/SRC/cla_hercond_c.f | 4 +- lapack-netlib/SRC/cla_hercond_x.f | 4 +- lapack-netlib/SRC/cla_herfsx_extended.f | 8 +- lapack-netlib/SRC/cla_porcond_c.f | 4 +- lapack-netlib/SRC/cla_porcond_x.f | 4 +- lapack-netlib/SRC/cla_porfsx_extended.f | 8 +- lapack-netlib/SRC/cla_porpvgrw.f | 2 +- lapack-netlib/SRC/cla_syrcond_c.f | 4 +- lapack-netlib/SRC/cla_syrcond_x.f | 4 +- lapack-netlib/SRC/cla_syrfsx_extended.f | 8 +- lapack-netlib/SRC/cla_syrpvgrw.f | 2 +- lapack-netlib/SRC/cla_wwaddw.f | 2 +- lapack-netlib/SRC/clahef_aa.f | 42 +- lapack-netlib/SRC/clahef_rk.f | 4 +- lapack-netlib/SRC/clahqr.f | 14 +- lapack-netlib/SRC/clamswlq.f | 1 + lapack-netlib/SRC/clamtsqr.f | 1 + lapack-netlib/SRC/clangb.f | 23 +- lapack-netlib/SRC/clange.f | 22 +- lapack-netlib/SRC/clanhb.f | 48 +- lapack-netlib/SRC/clanhe.f | 45 +- lapack-netlib/SRC/clanhp.f | 46 +- lapack-netlib/SRC/clanhs.f | 23 +- lapack-netlib/SRC/clansb.f | 42 +- lapack-netlib/SRC/clansp.f | 54 +- lapack-netlib/SRC/clansy.f | 40 +- lapack-netlib/SRC/clantb.f | 55 +- lapack-netlib/SRC/clantp.f | 53 +- lapack-netlib/SRC/clantr.f | 56 +- lapack-netlib/SRC/claqps.f | 2 +- lapack-netlib/SRC/claqr0.f | 34 +- lapack-netlib/SRC/claqr1.f | 2 +- lapack-netlib/SRC/claqr2.f | 18 +- lapack-netlib/SRC/claqr3.f | 18 +- lapack-netlib/SRC/claqr4.f | 34 +- lapack-netlib/SRC/claqr5.f | 52 +- lapack-netlib/SRC/clarfb.f | 2 + lapack-netlib/SRC/clarfx.f | 2 +- lapack-netlib/SRC/clarfy.f | 2 +- lapack-netlib/SRC/clarrv.f | 2 +- lapack-netlib/SRC/classq.f | 4 +- lapack-netlib/SRC/claswlq.f | 20 +- lapack-netlib/SRC/clasyf_aa.f | 48 +- lapack-netlib/SRC/clasyf_rk.f | 4 +- lapack-netlib/SRC/clatdf.f | 2 +- lapack-netlib/SRC/clatsqr.f | 25 +- lapack-netlib/SRC/claunhr_col_getrfnp.f | 248 + lapack-netlib/SRC/claunhr_col_getrfnp2.f | 314 + lapack-netlib/SRC/cporfsx.f | 16 +- lapack-netlib/SRC/cposvxx.f | 14 +- lapack-netlib/SRC/cpotrf2.f | 4 +- lapack-netlib/SRC/cstemr.f | 4 +- lapack-netlib/SRC/csycon_3.f | 9 +- lapack-netlib/SRC/csyconvf.f | 8 +- lapack-netlib/SRC/csyconvf_rook.f | 8 +- lapack-netlib/SRC/csyrfsx.f | 10 +- lapack-netlib/SRC/csysv_aa.f | 12 +- lapack-netlib/SRC/csysv_aa_2stage.f | 6 +- lapack-netlib/SRC/csysvxx.f | 10 +- lapack-netlib/SRC/csytf2_rk.f | 4 +- lapack-netlib/SRC/csytrf.f | 2 +- lapack-netlib/SRC/csytrf_aa.f | 12 +- lapack-netlib/SRC/csytrf_aa_2stage.f | 26 +- lapack-netlib/SRC/csytri2.f | 4 +- lapack-netlib/SRC/csytrs2.f | 2 +- lapack-netlib/SRC/csytrs_aa.f | 117 +- lapack-netlib/SRC/csytrs_aa_2stage.f | 18 +- lapack-netlib/SRC/ctgsy2.f | 18 +- lapack-netlib/SRC/ctplqt.f | 2 + lapack-netlib/SRC/ctplqt2.f | 2 + lapack-netlib/SRC/ctpmlqt.f | 4 +- lapack-netlib/SRC/ctpmqrt.f | 2 +- lapack-netlib/SRC/ctprfb.f | 4 +- lapack-netlib/SRC/cungtsqr.f | 307 + lapack-netlib/SRC/cunhr_col.f | 441 + lapack-netlib/SRC/dbdsqr.f | 2 +- lapack-netlib/SRC/dbdsvdx.f | 2 +- lapack-netlib/SRC/dcombssq.f | 92 + lapack-netlib/SRC/dgbrfsx.f | 10 +- lapack-netlib/SRC/dgbsvxx.f | 10 +- 
lapack-netlib/SRC/dgebak.f | 8 +- lapack-netlib/SRC/dgeesx.f | 4 +- lapack-netlib/SRC/dgejsv.f | 56 +- lapack-netlib/SRC/dgelq.f | 19 +- lapack-netlib/SRC/dgelq2.f | 18 +- lapack-netlib/SRC/dgelqf.f | 16 +- lapack-netlib/SRC/dgemlq.f | 3 +- lapack-netlib/SRC/dgemqr.f | 3 +- lapack-netlib/SRC/dgeqr.f | 20 +- lapack-netlib/SRC/dgeqr2.f | 19 +- lapack-netlib/SRC/dgeqr2p.f | 20 +- lapack-netlib/SRC/dgeqrf.f | 17 +- lapack-netlib/SRC/dgeqrfp.f | 20 +- lapack-netlib/SRC/dgerfsx.f | 10 +- lapack-netlib/SRC/dgesc2.f | 4 +- lapack-netlib/SRC/dgesdd.f | 4 +- lapack-netlib/SRC/dgesvdq.f | 1385 ++ lapack-netlib/SRC/dgesvj.f | 44 +- lapack-netlib/SRC/dgesvxx.f | 10 +- lapack-netlib/SRC/dgetc2.f | 2 +- lapack-netlib/SRC/dgetsls.f | 2 + lapack-netlib/SRC/dggesx.f | 8 +- lapack-netlib/SRC/dgsvj0.f | 20 +- lapack-netlib/SRC/dgsvj1.f | 30 +- lapack-netlib/SRC/dhseqr.f | 46 +- lapack-netlib/SRC/dla_gbrcond.f | 4 +- lapack-netlib/SRC/dla_gbrfsx_extended.f | 12 +- lapack-netlib/SRC/dla_gercond.f | 4 +- lapack-netlib/SRC/dla_gerfsx_extended.f | 12 +- lapack-netlib/SRC/dla_porcond.f | 4 +- lapack-netlib/SRC/dla_porfsx_extended.f | 8 +- lapack-netlib/SRC/dla_porpvgrw.f | 2 +- lapack-netlib/SRC/dla_syrcond.f | 4 +- lapack-netlib/SRC/dla_syrfsx_extended.f | 8 +- lapack-netlib/SRC/dla_syrpvgrw.f | 2 +- lapack-netlib/SRC/dla_wwaddw.f | 2 +- lapack-netlib/SRC/dlaed4.f | 2 +- lapack-netlib/SRC/dlaed8.f | 2 +- lapack-netlib/SRC/dlagtf.f | 6 +- lapack-netlib/SRC/dlagts.f | 20 +- lapack-netlib/SRC/dlahqr.f | 14 +- lapack-netlib/SRC/dlaln2.f | 2 +- lapack-netlib/SRC/dlamswlq.f | 1 + lapack-netlib/SRC/dlamtsqr.f | 1 + lapack-netlib/SRC/dlangb.f | 26 +- lapack-netlib/SRC/dlange.f | 22 +- lapack-netlib/SRC/dlanhs.f | 25 +- lapack-netlib/SRC/dlansb.f | 44 +- lapack-netlib/SRC/dlansp.f | 48 +- lapack-netlib/SRC/dlansy.f | 42 +- lapack-netlib/SRC/dlantb.f | 57 +- lapack-netlib/SRC/dlantp.f | 55 +- lapack-netlib/SRC/dlantr.f | 58 +- lapack-netlib/SRC/dlanv2.f | 8 +- lapack-netlib/SRC/dlaorhr_col_getrfnp.f | 248 + lapack-netlib/SRC/dlaorhr_col_getrfnp2.f | 305 + lapack-netlib/SRC/dlaqps.f | 2 +- lapack-netlib/SRC/dlaqr0.f | 38 +- lapack-netlib/SRC/dlaqr1.f | 2 +- lapack-netlib/SRC/dlaqr2.f | 18 +- lapack-netlib/SRC/dlaqr3.f | 18 +- lapack-netlib/SRC/dlaqr4.f | 38 +- lapack-netlib/SRC/dlaqr5.f | 52 +- lapack-netlib/SRC/dlarfb.f | 2 + lapack-netlib/SRC/dlarfx.f | 2 +- lapack-netlib/SRC/dlarfy.f | 2 +- lapack-netlib/SRC/dlarrb.f | 4 +- lapack-netlib/SRC/dlarre.f | 2 +- lapack-netlib/SRC/dlarrj.f | 2 +- lapack-netlib/SRC/dlarrv.f | 2 +- lapack-netlib/SRC/dlasd7.f | 2 +- lapack-netlib/SRC/dlasr.f | 2 +- lapack-netlib/SRC/dlassq.f | 2 +- lapack-netlib/SRC/dlaswlq.f | 20 +- lapack-netlib/SRC/dlasyf_aa.f | 42 +- lapack-netlib/SRC/dlasyf_rk.f | 4 +- lapack-netlib/SRC/dlasyf_rook.f | 2 +- lapack-netlib/SRC/dlatdf.f | 4 +- lapack-netlib/SRC/dlatsqr.f | 23 +- lapack-netlib/SRC/dorgtsqr.f | 306 + lapack-netlib/SRC/dorhr_col.f | 440 + lapack-netlib/SRC/dporfsx.f | 12 +- lapack-netlib/SRC/dposvxx.f | 10 +- lapack-netlib/SRC/dsb2st_kernels.f | 70 +- lapack-netlib/SRC/dsbgvx.f | 6 +- lapack-netlib/SRC/dsgesv.f | 10 +- lapack-netlib/SRC/dsposv.f | 6 +- lapack-netlib/SRC/dstemr.f | 4 +- lapack-netlib/SRC/dsyconvf.f | 8 +- lapack-netlib/SRC/dsyconvf_rook.f | 8 +- lapack-netlib/SRC/dsyev_2stage.f | 2 +- lapack-netlib/SRC/dsyevd_2stage.f | 2 +- lapack-netlib/SRC/dsyrfsx.f | 10 +- lapack-netlib/SRC/dsysv_aa.f | 6 +- lapack-netlib/SRC/dsysv_aa_2stage.f | 4 +- lapack-netlib/SRC/dsysvxx.f | 10 +- lapack-netlib/SRC/dsytf2_rk.f | 4 +- 
lapack-netlib/SRC/dsytrd_2stage.f | 13 +- lapack-netlib/SRC/dsytrd_sb2st.F | 4 +- lapack-netlib/SRC/dsytrd_sy2sb.f | 2 +- lapack-netlib/SRC/dsytrf.f | 6 +- lapack-netlib/SRC/dsytrf_aa.f | 8 +- lapack-netlib/SRC/dsytrf_aa_2stage.f | 58 +- lapack-netlib/SRC/dsytri2.f | 4 +- lapack-netlib/SRC/dsytrs_aa.f | 112 +- lapack-netlib/SRC/dsytrs_aa_2stage.f | 18 +- lapack-netlib/SRC/dtgsy2.f | 4 +- lapack-netlib/SRC/dtgsyl.f | 14 +- lapack-netlib/SRC/dtpmlqt.f | 2 +- lapack-netlib/SRC/dtpmqrt.f | 2 +- lapack-netlib/SRC/dtprfb.f | 4 +- lapack-netlib/SRC/ilaenv.f | 17 +- lapack-netlib/SRC/ilaenv2stage.f | 4 +- lapack-netlib/SRC/iparam2stage.F | 10 +- lapack-netlib/SRC/iparmq.f | 6 +- lapack-netlib/SRC/meson.build | 11 + lapack-netlib/SRC/sbdsvdx.f | 2 +- lapack-netlib/SRC/scombssq.f | 92 + lapack-netlib/SRC/sgbrfsx.f | 10 +- lapack-netlib/SRC/sgbsvxx.f | 10 +- lapack-netlib/SRC/sgebak.f | 8 +- lapack-netlib/SRC/sgeesx.f | 4 +- lapack-netlib/SRC/sgejsv.f | 54 +- lapack-netlib/SRC/sgelq.f | 19 +- lapack-netlib/SRC/sgelq2.f | 18 +- lapack-netlib/SRC/sgelqf.f | 16 +- lapack-netlib/SRC/sgelqt.f | 2 + lapack-netlib/SRC/sgelqt3.f | 2 + lapack-netlib/SRC/sgemlq.f | 3 +- lapack-netlib/SRC/sgemlqt.f | 2 + lapack-netlib/SRC/sgemqr.f | 3 +- lapack-netlib/SRC/sgeqr.f | 20 +- lapack-netlib/SRC/sgeqr2.f | 19 +- lapack-netlib/SRC/sgeqr2p.f | 20 +- lapack-netlib/SRC/sgeqrf.f | 17 +- lapack-netlib/SRC/sgeqrfp.f | 20 +- lapack-netlib/SRC/sgerfsx.f | 10 +- lapack-netlib/SRC/sgesc2.f | 4 +- lapack-netlib/SRC/sgesdd.f | 4 +- lapack-netlib/SRC/sgesvdq.f | 1388 ++ lapack-netlib/SRC/sgesvj.f | 44 +- lapack-netlib/SRC/sgesvxx.f | 10 +- lapack-netlib/SRC/sgetc2.f | 2 +- lapack-netlib/SRC/sgetsls.f | 4 +- lapack-netlib/SRC/sggesx.f | 8 +- lapack-netlib/SRC/sgsvj0.f | 20 +- lapack-netlib/SRC/sgsvj1.f | 20 +- lapack-netlib/SRC/shseqr.f | 46 +- lapack-netlib/SRC/sla_gbrcond.f | 4 +- lapack-netlib/SRC/sla_gbrfsx_extended.f | 12 +- lapack-netlib/SRC/sla_gercond.f | 4 +- lapack-netlib/SRC/sla_gerfsx_extended.f | 12 +- lapack-netlib/SRC/sla_porcond.f | 4 +- lapack-netlib/SRC/sla_porfsx_extended.f | 8 +- lapack-netlib/SRC/sla_syrcond.f | 4 +- lapack-netlib/SRC/sla_syrfsx_extended.f | 8 +- lapack-netlib/SRC/sla_syrpvgrw.f | 2 +- lapack-netlib/SRC/sla_wwaddw.f | 2 +- lapack-netlib/SRC/slaed4.f | 2 +- lapack-netlib/SRC/slaed8.f | 2 +- lapack-netlib/SRC/slagtf.f | 6 +- lapack-netlib/SRC/slagts.f | 20 +- lapack-netlib/SRC/slahqr.f | 14 +- lapack-netlib/SRC/slaln2.f | 2 +- lapack-netlib/SRC/slamswlq.f | 1 + lapack-netlib/SRC/slamtsqr.f | 1 + lapack-netlib/SRC/slangb.f | 26 +- lapack-netlib/SRC/slange.f | 22 +- lapack-netlib/SRC/slanhs.f | 25 +- lapack-netlib/SRC/slansb.f | 44 +- lapack-netlib/SRC/slansp.f | 48 +- lapack-netlib/SRC/slansy.f | 42 +- lapack-netlib/SRC/slantb.f | 57 +- lapack-netlib/SRC/slantp.f | 55 +- lapack-netlib/SRC/slantr.f | 58 +- lapack-netlib/SRC/slanv2.f | 8 +- lapack-netlib/SRC/slaorhr_col_getrfnp.f | 248 + lapack-netlib/SRC/slaorhr_col_getrfnp2.f | 305 + lapack-netlib/SRC/slaqps.f | 2 +- lapack-netlib/SRC/slaqr0.f | 38 +- lapack-netlib/SRC/slaqr1.f | 2 +- lapack-netlib/SRC/slaqr2.f | 18 +- lapack-netlib/SRC/slaqr3.f | 18 +- lapack-netlib/SRC/slaqr4.f | 38 +- lapack-netlib/SRC/slaqr5.f | 52 +- lapack-netlib/SRC/slarfb.f | 2 + lapack-netlib/SRC/slarfx.f | 2 +- lapack-netlib/SRC/slarfy.f | 2 +- lapack-netlib/SRC/slarrb.f | 4 +- lapack-netlib/SRC/slarre.f | 2 +- lapack-netlib/SRC/slarrj.f | 2 +- lapack-netlib/SRC/slarrv.f | 2 +- lapack-netlib/SRC/slasd7.f | 2 +- lapack-netlib/SRC/slassq.f | 2 +- lapack-netlib/SRC/slaswlq.f 
| 22 +- lapack-netlib/SRC/slasyf_aa.f | 42 +- lapack-netlib/SRC/slasyf_rk.f | 4 +- lapack-netlib/SRC/slatdf.f | 4 +- lapack-netlib/SRC/slatsqr.f | 23 +- lapack-netlib/SRC/sorgtsqr.f | 306 + lapack-netlib/SRC/sorhr_col.f | 439 + lapack-netlib/SRC/sporfsx.f | 12 +- lapack-netlib/SRC/sposvxx.f | 10 +- lapack-netlib/SRC/ssb2st_kernels.f | 70 +- lapack-netlib/SRC/ssbgvx.f | 6 +- lapack-netlib/SRC/sstemr.f | 4 +- lapack-netlib/SRC/ssyconvf.f | 8 +- lapack-netlib/SRC/ssyconvf_rook.f | 8 +- lapack-netlib/SRC/ssyev_2stage.f | 2 +- lapack-netlib/SRC/ssyevd_2stage.f | 2 +- lapack-netlib/SRC/ssyrfsx.f | 10 +- lapack-netlib/SRC/ssysv_aa.f | 6 +- lapack-netlib/SRC/ssysv_aa_2stage.f | 4 +- lapack-netlib/SRC/ssysvxx.f | 10 +- lapack-netlib/SRC/ssytf2_rk.f | 4 +- lapack-netlib/SRC/ssytrd_2stage.f | 13 +- lapack-netlib/SRC/ssytrd_sb2st.F | 4 +- lapack-netlib/SRC/ssytrd_sy2sb.f | 2 +- lapack-netlib/SRC/ssytrf.f | 6 +- lapack-netlib/SRC/ssytrf_aa.f | 8 +- lapack-netlib/SRC/ssytrf_aa_2stage.f | 26 +- lapack-netlib/SRC/ssytri2.f | 4 +- lapack-netlib/SRC/ssytrs_aa.f | 128 +- lapack-netlib/SRC/ssytrs_aa_2stage.f | 18 +- lapack-netlib/SRC/stgsy2.f | 4 +- lapack-netlib/SRC/stgsyl.f | 14 +- lapack-netlib/SRC/stpmlqt.f | 2 +- lapack-netlib/SRC/stpmqrt.f | 2 +- lapack-netlib/SRC/stprfb.f | 4 +- lapack-netlib/SRC/zcgesv.f | 10 +- lapack-netlib/SRC/zcposv.f | 6 +- lapack-netlib/SRC/zgbrfsx.f | 12 +- lapack-netlib/SRC/zgbsvxx.f | 10 +- lapack-netlib/SRC/zgebak.f | 8 +- lapack-netlib/SRC/zgeev.f | 2 +- lapack-netlib/SRC/zgejsv.f | 64 +- lapack-netlib/SRC/zgelq.f | 19 +- lapack-netlib/SRC/zgelq2.f | 18 +- lapack-netlib/SRC/zgelqf.f | 16 +- lapack-netlib/SRC/zgemlq.f | 3 +- lapack-netlib/SRC/zgemqr.f | 3 +- lapack-netlib/SRC/zgeqr.f | 20 +- lapack-netlib/SRC/zgeqr2.f | 19 +- lapack-netlib/SRC/zgeqr2p.f | 20 +- lapack-netlib/SRC/zgeqrf.f | 17 +- lapack-netlib/SRC/zgeqrfp.f | 20 +- lapack-netlib/SRC/zgerfsx.f | 12 +- lapack-netlib/SRC/zgesc2.f | 2 +- lapack-netlib/SRC/zgesvdq.f | 1389 ++ lapack-netlib/SRC/zgesvdx.f | 2 +- lapack-netlib/SRC/zgesvj.f | 46 +- lapack-netlib/SRC/zgesvxx.f | 10 +- lapack-netlib/SRC/zgetsls.f | 2 + lapack-netlib/SRC/zggesx.f | 8 +- lapack-netlib/SRC/zgsvj0.f | 18 +- lapack-netlib/SRC/zgsvj1.f | 20 +- lapack-netlib/SRC/zhb2st_kernels.f | 70 +- lapack-netlib/SRC/zhecon_3.f | 9 +- lapack-netlib/SRC/zheevr.f | 2 +- lapack-netlib/SRC/zheevr_2stage.f | 2 +- lapack-netlib/SRC/zhegs2.f | 1 + lapack-netlib/SRC/zhegst.f | 1 + lapack-netlib/SRC/zherfsx.f | 12 +- lapack-netlib/SRC/zhesv_aa.f | 6 +- lapack-netlib/SRC/zhesv_aa_2stage.f | 8 +- lapack-netlib/SRC/zhesvxx.f | 16 +- lapack-netlib/SRC/zhetf2_rk.f | 4 +- lapack-netlib/SRC/zhetrd_2stage.f | 13 +- lapack-netlib/SRC/zhetrd_hb2st.F | 4 +- lapack-netlib/SRC/zhetrd_he2hb.f | 2 +- lapack-netlib/SRC/zhetrf_aa.f | 8 +- lapack-netlib/SRC/zhetrf_aa_2stage.f | 42 +- lapack-netlib/SRC/zhetri2.f | 4 +- lapack-netlib/SRC/zhetrs_aa.f | 124 +- lapack-netlib/SRC/zhetrs_aa_2stage.f | 30 +- lapack-netlib/SRC/zhseqr.f | 44 +- lapack-netlib/SRC/zla_gbrcond_c.f | 4 +- lapack-netlib/SRC/zla_gbrcond_x.f | 4 +- lapack-netlib/SRC/zla_gbrfsx_extended.f | 12 +- lapack-netlib/SRC/zla_gercond_c.f | 8 +- lapack-netlib/SRC/zla_gercond_x.f | 4 +- lapack-netlib/SRC/zla_gerfsx_extended.f | 12 +- lapack-netlib/SRC/zla_hercond_c.f | 4 +- lapack-netlib/SRC/zla_hercond_x.f | 4 +- lapack-netlib/SRC/zla_herfsx_extended.f | 8 +- lapack-netlib/SRC/zla_herpvgrw.f | 2 +- lapack-netlib/SRC/zla_porcond_c.f | 4 +- lapack-netlib/SRC/zla_porcond_x.f | 4 +- lapack-netlib/SRC/zla_porfsx_extended.f | 8 +- 
lapack-netlib/SRC/zla_porpvgrw.f | 2 +- lapack-netlib/SRC/zla_syrcond_c.f | 4 +- lapack-netlib/SRC/zla_syrcond_x.f | 4 +- lapack-netlib/SRC/zla_syrfsx_extended.f | 8 +- lapack-netlib/SRC/zla_syrpvgrw.f | 2 +- lapack-netlib/SRC/zla_wwaddw.f | 2 +- lapack-netlib/SRC/zlahef_aa.f | 42 +- lapack-netlib/SRC/zlahef_rk.f | 4 +- lapack-netlib/SRC/zlahqr.f | 14 +- lapack-netlib/SRC/zlamswlq.f | 1 + lapack-netlib/SRC/zlamtsqr.f | 1 + lapack-netlib/SRC/zlangb.f | 23 +- lapack-netlib/SRC/zlange.f | 22 +- lapack-netlib/SRC/zlanhb.f | 48 +- lapack-netlib/SRC/zlanhe.f | 45 +- lapack-netlib/SRC/zlanhp.f | 46 +- lapack-netlib/SRC/zlanhs.f | 23 +- lapack-netlib/SRC/zlansb.f | 42 +- lapack-netlib/SRC/zlansp.f | 54 +- lapack-netlib/SRC/zlansy.f | 40 +- lapack-netlib/SRC/zlantb.f | 55 +- lapack-netlib/SRC/zlantp.f | 53 +- lapack-netlib/SRC/zlantr.f | 56 +- lapack-netlib/SRC/zlaqps.f | 2 +- lapack-netlib/SRC/zlaqr0.f | 34 +- lapack-netlib/SRC/zlaqr1.f | 2 +- lapack-netlib/SRC/zlaqr2.f | 18 +- lapack-netlib/SRC/zlaqr3.f | 18 +- lapack-netlib/SRC/zlaqr4.f | 32 +- lapack-netlib/SRC/zlaqr5.f | 52 +- lapack-netlib/SRC/zlarfb.f | 2 + lapack-netlib/SRC/zlarfx.f | 2 +- lapack-netlib/SRC/zlarfy.f | 2 +- lapack-netlib/SRC/zlarrv.f | 2 +- lapack-netlib/SRC/zlassq.f | 4 +- lapack-netlib/SRC/zlaswlq.f | 20 +- lapack-netlib/SRC/zlasyf_aa.f | 42 +- lapack-netlib/SRC/zlasyf_rk.f | 4 +- lapack-netlib/SRC/zlatdf.f | 2 +- lapack-netlib/SRC/zlatsqr.f | 25 +- lapack-netlib/SRC/zlaunhr_col_getrfnp.f | 248 + lapack-netlib/SRC/zlaunhr_col_getrfnp2.f | 314 + lapack-netlib/SRC/zporfsx.f | 16 +- lapack-netlib/SRC/zposvxx.f | 14 +- lapack-netlib/SRC/zpotrf2.f | 4 +- lapack-netlib/SRC/zstemr.f | 4 +- lapack-netlib/SRC/zsycon_3.f | 9 +- lapack-netlib/SRC/zsyconvf.f | 8 +- lapack-netlib/SRC/zsyconvf_rook.f | 8 +- lapack-netlib/SRC/zsyrfsx.f | 10 +- lapack-netlib/SRC/zsysv_aa.f | 6 +- lapack-netlib/SRC/zsysv_aa_2stage.f | 6 +- lapack-netlib/SRC/zsysvxx.f | 10 +- lapack-netlib/SRC/zsytf2_rk.f | 4 +- lapack-netlib/SRC/zsytrf.f | 2 +- lapack-netlib/SRC/zsytrf_aa.f | 8 +- lapack-netlib/SRC/zsytrf_aa_2stage.f | 26 +- lapack-netlib/SRC/zsytri2.f | 4 +- lapack-netlib/SRC/zsytrs2.f | 2 +- lapack-netlib/SRC/zsytrs_aa.f | 112 +- lapack-netlib/SRC/zsytrs_aa_2stage.f | 18 +- lapack-netlib/SRC/ztgsy2.f | 4 +- lapack-netlib/SRC/ztpmlqt.f | 2 +- lapack-netlib/SRC/ztpmqrt.f | 2 +- lapack-netlib/SRC/ztprfb.f | 4 +- lapack-netlib/SRC/zungtsqr.f | 307 + lapack-netlib/SRC/zunhr_col.f | 441 + lapack-netlib/TESTING/CMakeLists.txt | 2 +- lapack-netlib/TESTING/EIG/Makefile | 35 +- lapack-netlib/TESTING/EIG/cbdt05.f | 1 + lapack-netlib/TESTING/EIG/cchkst.f | 2 +- lapack-netlib/TESTING/EIG/cchkst2stg.f | 2 +- lapack-netlib/TESTING/EIG/cdrgsx.f | 2 +- lapack-netlib/TESTING/EIG/cdrvbd.f | 199 +- lapack-netlib/TESTING/EIG/cerred.f | 59 +- lapack-netlib/TESTING/EIG/cget51.f | 17 +- lapack-netlib/TESTING/EIG/chbt21.f | 12 +- lapack-netlib/TESTING/EIG/chet21.f | 32 +- lapack-netlib/TESTING/EIG/chet22.f | 12 +- lapack-netlib/TESTING/EIG/chpt21.f | 35 +- lapack-netlib/TESTING/EIG/cstt21.f | 13 +- lapack-netlib/TESTING/EIG/dbdt05.f | 1 + lapack-netlib/TESTING/EIG/dchkst.f | 2 +- lapack-netlib/TESTING/EIG/dchkst2stg.f | 2 +- lapack-netlib/TESTING/EIG/ddrgsx.f | 2 +- lapack-netlib/TESTING/EIG/ddrvbd.f | 113 +- lapack-netlib/TESTING/EIG/derred.f | 59 +- lapack-netlib/TESTING/EIG/dget39.f | 2 +- lapack-netlib/TESTING/EIG/dsbt21.f | 11 +- lapack-netlib/TESTING/EIG/dspt21.f | 32 +- lapack-netlib/TESTING/EIG/dsyt21.f | 30 +- lapack-netlib/TESTING/EIG/dsyt22.f | 12 +- 
lapack-netlib/TESTING/EIG/sbdt05.f | 1 + lapack-netlib/TESTING/EIG/schkst.f | 2 +- lapack-netlib/TESTING/EIG/schkst2stg.f | 2 +- lapack-netlib/TESTING/EIG/sdrgsx.f | 2 +- lapack-netlib/TESTING/EIG/sdrvbd.f | 111 +- lapack-netlib/TESTING/EIG/serred.f | 59 +- lapack-netlib/TESTING/EIG/sget39.f | 2 +- lapack-netlib/TESTING/EIG/ssbt21.f | 11 +- lapack-netlib/TESTING/EIG/sspt21.f | 32 +- lapack-netlib/TESTING/EIG/ssyt21.f | 30 +- lapack-netlib/TESTING/EIG/ssyt22.f | 12 +- lapack-netlib/TESTING/EIG/zbdt05.f | 1 + lapack-netlib/TESTING/EIG/zchkst.f | 2 +- lapack-netlib/TESTING/EIG/zchkst2stg.f | 2 +- lapack-netlib/TESTING/EIG/zdrgev3.f | 2 +- lapack-netlib/TESTING/EIG/zdrgsx.f | 2 +- lapack-netlib/TESTING/EIG/zdrvbd.f | 196 +- lapack-netlib/TESTING/EIG/zerred.f | 59 +- lapack-netlib/TESTING/EIG/zget51.f | 17 +- lapack-netlib/TESTING/EIG/zhbt21.f | 12 +- lapack-netlib/TESTING/EIG/zhet21.f | 32 +- lapack-netlib/TESTING/EIG/zhet22.f | 12 +- lapack-netlib/TESTING/EIG/zhpt21.f | 36 +- lapack-netlib/TESTING/EIG/zstt21.f | 11 +- lapack-netlib/TESTING/LIN/CMakeLists.txt | 12 +- lapack-netlib/TESTING/LIN/Makefile | 75 +- lapack-netlib/TESTING/LIN/cchkaa.f | 40 +- lapack-netlib/TESTING/LIN/cchkunhr_col.f | 239 + lapack-netlib/TESTING/LIN/cdrvls.f | 25 +- lapack-netlib/TESTING/LIN/cdrvsy_rk.f | 3 +- lapack-netlib/TESTING/LIN/cerrunhr_col.f | 164 + lapack-netlib/TESTING/LIN/cerrvx.f | 4 +- lapack-netlib/TESTING/LIN/clahilb.f | 2 +- lapack-netlib/TESTING/LIN/ctsqr01.f | 26 +- lapack-netlib/TESTING/LIN/cunhr_col01.f | 390 + lapack-netlib/TESTING/LIN/dchkaa.f | 39 +- lapack-netlib/TESTING/LIN/dchkorhr_col.f | 239 + lapack-netlib/TESTING/LIN/ddrvls.f | 16 +- lapack-netlib/TESTING/LIN/derrorhr_col.f | 164 + lapack-netlib/TESTING/LIN/derrvx.f | 2 +- lapack-netlib/TESTING/LIN/dorhr_col01.f | 386 + lapack-netlib/TESTING/LIN/dtsqr01.f | 26 +- lapack-netlib/TESTING/LIN/schkaa.f | 35 +- lapack-netlib/TESTING/LIN/schkorhr_col.f | 239 + lapack-netlib/TESTING/LIN/sdrvls.f | 20 +- lapack-netlib/TESTING/LIN/serrorhr_col.f | 164 + lapack-netlib/TESTING/LIN/serrvx.f | 2 +- lapack-netlib/TESTING/LIN/sorhr_col01.f | 386 + lapack-netlib/TESTING/LIN/stsqr01.f | 26 +- lapack-netlib/TESTING/LIN/zchkaa.f | 43 +- lapack-netlib/TESTING/LIN/zchkunhr_col.f | 239 + lapack-netlib/TESTING/LIN/zdrvhe_rk.f | 1 + lapack-netlib/TESTING/LIN/zdrvls.f | 25 +- lapack-netlib/TESTING/LIN/zerrunhr_col.f | 164 + lapack-netlib/TESTING/LIN/zerrvx.f | 36 +- lapack-netlib/TESTING/LIN/zlahilb.f | 2 +- lapack-netlib/TESTING/LIN/ztsqr01.f | 26 +- lapack-netlib/TESTING/LIN/zunhr_col01.f | 390 + lapack-netlib/TESTING/MATGEN/Makefile | 43 +- lapack-netlib/TESTING/MATGEN/clahilb.f | 2 +- lapack-netlib/TESTING/MATGEN/clatm2.f | 2 +- lapack-netlib/TESTING/MATGEN/clatm3.f | 2 +- lapack-netlib/TESTING/MATGEN/clatmr.f | 30 +- lapack-netlib/TESTING/MATGEN/dlatm2.f | 2 +- lapack-netlib/TESTING/MATGEN/dlatm3.f | 2 +- lapack-netlib/TESTING/MATGEN/dlatmr.f | 30 +- lapack-netlib/TESTING/MATGEN/slatm2.f | 2 +- lapack-netlib/TESTING/MATGEN/slatm3.f | 2 +- lapack-netlib/TESTING/MATGEN/slatmr.f | 30 +- lapack-netlib/TESTING/MATGEN/zlahilb.f | 2 +- lapack-netlib/TESTING/MATGEN/zlatm2.f | 2 +- lapack-netlib/TESTING/MATGEN/zlatm3.f | 2 +- lapack-netlib/TESTING/MATGEN/zlatmr.f | 30 +- lapack-netlib/TESTING/Makefile | 188 +- lapack-netlib/TESTING/ctest.in | 1 + lapack-netlib/TESTING/dtest.in | 1 + lapack-netlib/TESTING/stest.in | 1 + lapack-netlib/TESTING/ztest.in | 1 + lapack-netlib/appveyor.yml | 64 - lapack-netlib/lapack_build.cmake | 8 +- lapack-netlib/lapack_testing.py | 16 +- 
lapack-netlib/make.inc.example | 34 +- lapack-netlib/meson.build | 28 + lapack-netlib/meson_options.txt | 3 + 812 files changed, 36335 insertions(+), 11964 deletions(-) create mode 100644 lapack-netlib/.appveyor.yml create mode 100644 lapack-netlib/BLAS/SRC/meson.build create mode 100644 lapack-netlib/LAPACKE/include/lapack.h create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgesvdq_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgesvdq_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgesvdq_work.c create mode 100644 lapack-netlib/SRC/cgesvdq.f create mode 100644 lapack-netlib/SRC/claunhr_col_getrfnp.f create mode 100644 lapack-netlib/SRC/claunhr_col_getrfnp2.f create mode 100644 lapack-netlib/SRC/cungtsqr.f create mode 100644 lapack-netlib/SRC/cunhr_col.f create mode 100644 lapack-netlib/SRC/dcombssq.f create mode 100644 lapack-netlib/SRC/dgesvdq.f create mode 100644 lapack-netlib/SRC/dlaorhr_col_getrfnp.f create mode 100644 lapack-netlib/SRC/dlaorhr_col_getrfnp2.f create mode 100644 lapack-netlib/SRC/dorgtsqr.f create mode 100644 lapack-netlib/SRC/dorhr_col.f create mode 100644 lapack-netlib/SRC/meson.build create mode 100644 lapack-netlib/SRC/scombssq.f create mode 100644 lapack-netlib/SRC/sgesvdq.f create mode 100644 lapack-netlib/SRC/slaorhr_col_getrfnp.f create mode 100644 lapack-netlib/SRC/slaorhr_col_getrfnp2.f create mode 100644 lapack-netlib/SRC/sorgtsqr.f create mode 100644 lapack-netlib/SRC/sorhr_col.f create mode 100644 lapack-netlib/SRC/zgesvdq.f create mode 100644 lapack-netlib/SRC/zlaunhr_col_getrfnp.f create mode 100644 lapack-netlib/SRC/zlaunhr_col_getrfnp2.f create mode 100644 lapack-netlib/SRC/zungtsqr.f create mode 100644 lapack-netlib/SRC/zunhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/cchkunhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/cerrunhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/cunhr_col01.f create mode 100644 lapack-netlib/TESTING/LIN/dchkorhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/derrorhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/dorhr_col01.f create mode 100644 lapack-netlib/TESTING/LIN/schkorhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/serrorhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/sorhr_col01.f create mode 100644 lapack-netlib/TESTING/LIN/zchkunhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/zerrunhr_col.f create mode 100644 lapack-netlib/TESTING/LIN/zunhr_col01.f delete mode 100644 lapack-netlib/appveyor.yml create mode 100644 lapack-netlib/meson.build create mode 100644 lapack-netlib/meson_options.txt diff --git a/Makefile b/Makefile index 60f189ef2..a22e16bab 100644 --- a/Makefile +++ b/Makefile @@ -247,21 +247,21 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -319,7 +319,7 @@ lapack-test : ifneq ($(CROSS), 1) ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) - (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) + (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) endif lapack-runtest: diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index d1d2cdd3b..18a74d18e 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -115,7 +115,9 @@ set(SLASRC stplqt.f stplqt2.f stpmlqt.f ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f - ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f) + ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f + scombssq.f sgesvdq.f slaorhr_col_getrfnp.f + slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -210,7 +212,9 @@ set(CLASRC ctplqt.f ctplqt2.f ctpmlqt.f chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f - chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f) + chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f + cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f + cungtsqr.f cunhr_col.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -299,7 +303,9 @@ set(DLASRC dtplqt.f dtplqt2.f dtpmlqt.f dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f - dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f) + dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f + dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f + dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f ) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -398,7 +404,9 @@ set(ZLASRC zgelq.f zlaswlq.f zlamswlq.f zgemlq.f zhetrd_2stage.f
zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f - zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f) + zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f + zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f + zungtsqr.f zunhr_col.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 0fc88b882..f10905c4d 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -715,6 +715,8 @@ set(DSRC lapacke_dgesv_work.c lapacke_dgesvd.c lapacke_dgesvd_work.c + lapacke_dgesvdq.c + lapacke_dgesvdq_work.c lapacke_dgesvdx.c lapacke_dgesvdx_work.c lapacke_dgesvj.c @@ -1287,6 +1289,8 @@ set(SSRC lapacke_sgesv_work.c lapacke_sgesvd.c lapacke_sgesvd_work.c + lapacke_sgesvdq.c + lapacke_sgesvdq_work.c lapacke_sgesvdx.c lapacke_sgesvdx_work.c lapacke_sgesvj.c @@ -1853,6 +1857,8 @@ set(ZSRC lapacke_zgesv_work.c lapacke_zgesvd.c lapacke_zgesvd_work.c + lapacke_zgesvdq.c + lapacke_zgesvdq_work.c lapacke_zgesvdx.c lapacke_zgesvdx_work.c lapacke_zgesvj.c diff --git a/exports/gensymbol b/exports/gensymbol index 37ba0b191..d2894e6c8 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -694,7 +694,19 @@ # functions added for lapack-3.8.0 - ilaenv2stage + ilaenv2stage, + + # functions added for lapack-3.9.0 + cgesvdq, + cungtsqr, + dcombssq, + dgesvdq, + dorgtsqr, + scombssq, + sgesvdq, + sorgtsqr, + zgesvdq, + zungtsqr ); @lapack_extendedprecision_objs = ( @@ -3347,6 +3359,15 @@ LAPACKE_zsytrf_aa_2stage_work, LAPACKE_zsytrs_aa_2stage, LAPACKE_zsytrs_aa_2stage_work, + + # new functions from 3.9.0 + LAPACKE_dgesvdq, + LAPACKE_dgesvdq_work, + LAPACKE_sgesvdq, + LAPACKE_sgesvdq_work, + LAPACKE_zgesvdq, + LAPACKE_zgesvdq_work + ); #These function may need 2 underscores. @@ -3419,7 +3440,13 @@ dsytrf_aa_2stage, dsytrs_aa_2stage, zhesv_aa_2stage, zhetrf_aa_2stage, zhetrs_aa_2stage, zsysv_aa_2stage, - zsytrf_aa_2stage, zsytrs_aa_2stage + zsytrf_aa_2stage, zsytrs_aa_2stage, +# 3.9.0 + claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, + dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, + slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, + zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col + ); diff --git a/lapack-netlib/.appveyor.yml b/lapack-netlib/.appveyor.yml new file mode 100644 index 000000000..0c16dcf7b --- /dev/null +++ b/lapack-netlib/.appveyor.yml @@ -0,0 +1,38 @@ +image: +- Visual Studio 2017 + +configuration: Release +clone_depth: 3 + +matrix: + fast_finish: false + +skip_commits: +# Add [av skip] to commit messages + message: /\[av skip\]/ + +cache: + - '%APPVEYOR_BUILD_FOLDER%\build' + +environment: + global: + CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 + +install: + - call %CONDA_INSTALL_LOCN%\Scripts\activate.bat + - conda config --add channels conda-forge --force + - conda install --yes --quiet flang jom + - call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 + - set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" + - set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" + +before_build: + - ps: if (-Not (Test-Path .\build)) { mkdir build } + - cd build + - cmake -G "NMake Makefiles JOM" -DCMAKE_Fortran_COMPILER=flang -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=ON .. + +build_script: + - cmake --build . 
+ +test_script: + - ctest -j2 diff --git a/lapack-netlib/.gitignore b/lapack-netlib/.gitignore index 4ac90962e..015f09d77 100644 --- a/lapack-netlib/.gitignore +++ b/lapack-netlib/.gitignore @@ -35,3 +35,9 @@ LAPACKE/example/xexample* # SED SRC/*-e LAPACKE/src/*-e +build* + +# DOCS documentation +DOCS/man +DOCS/explore-html +output_err diff --git a/lapack-netlib/.travis.yml b/lapack-netlib/.travis.yml index 68cfa607a..04369dafb 100644 --- a/lapack-netlib/.travis.yml +++ b/lapack-netlib/.travis.yml @@ -1,33 +1,32 @@ -language: cpp +language: c +dist: xenial +group: travis_latest + +git: + depth: 3 + quiet: true addons: apt: - sources: - - george-edison55-precise-backports # cmake packages: - - cmake - - cmake-data - - gfortran - -os: - - linux - - osx - -env: - - CMAKE_BUILD_TYPE=Release - - CMAKE_BUILD_TYPE=Coverage + - gfortran -install: - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; - then - for pkg in gcc cmake; do - if brew list -1 | grep -q "^${pkg}\$"; then - brew outdated $pkg || brew upgrade $pkg; - else - brew install $pkg; - fi - done - fi +matrix: + include: + - os: linux + env: CMAKE_BUILD_TYPE=Release + - os: linux + env: CMAKE_BUILD_TYPE=Coverage + - os: osx + env: CMAKE_BUILD_TYPE=Release + before_install: + - brew update > /dev/null + - brew install gcc > /dev/null + - os: osx + env: CMAKE_BUILD_TYPE=Coverage + before_install: + - brew update > /dev/null + - brew install gcc > /dev/null script: - export PR=https://api.github.com/repos/$TRAVIS_REPO_SLUG/pulls/$TRAVIS_PULL_REQUEST diff --git a/lapack-netlib/BLAS/CMakeLists.txt b/lapack-netlib/BLAS/CMakeLists.txt index e122b2b33..ee5676fc6 100644 --- a/lapack-netlib/BLAS/CMakeLists.txt +++ b/lapack-netlib/BLAS/CMakeLists.txt @@ -6,4 +6,5 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in ${CMAKE_CURRENT_BINARY_DIR install(FILES ${CMAKE_CURRENT_BINARY_DIR}/blas.pc DESTINATION ${PKG_CONFIG_DIR} + COMPONENT Development ) diff --git a/lapack-netlib/BLAS/Makefile b/lapack-netlib/BLAS/Makefile index f9c4b534c..088ea5d50 100644 --- a/lapack-netlib/BLAS/Makefile +++ b/lapack-netlib/BLAS/Makefile @@ -1,13 +1,18 @@ -include ../make.inc +TOPSRCDIR = .. +include $(TOPSRCDIR)/make.inc +.PHONY: all all: blas +.PHONY: blas blas: $(MAKE) -C SRC +.PHONY: blas_testing blas_testing: blas $(MAKE) -C TESTING run +.PHONY: clean cleanobj cleanlib cleanexe cleantest clean: $(MAKE) -C SRC clean $(MAKE) -C TESTING clean diff --git a/lapack-netlib/BLAS/SRC/Makefile b/lapack-netlib/BLAS/SRC/Makefile index a436365aa..66bb96421 100644 --- a/lapack-netlib/BLAS/SRC/Makefile +++ b/lapack-netlib/BLAS/SRC/Makefile @@ -1,5 +1,3 @@ -include ../../make.inc - ####################################################################### # This is the makefile to create a library for the BLAS. # The files are grouped as follows: @@ -55,6 +53,10 @@ include ../../make.inc # ####################################################################### +TOPSRCDIR = ../.. 
+include $(TOPSRCDIR)/make.inc + +.PHONY: all all: $(BLASLIB) #--------------------------------------------------------- @@ -138,33 +140,32 @@ ALLOBJ = $(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \ $(ZBLAS2) $(ZBLAS3) $(ALLBLAS) $(BLASLIB): $(ALLOBJ) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ +.PHONY: single double complex complex16 single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) FRC: @FRC=$(FRC) +.PHONY: clean cleanobj cleanlib clean: cleanobj cleanlib cleanobj: rm -f *.o cleanlib: #rm -f $(BLASLIB) # May point to a system lib, e.g. -lblas - -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< diff --git a/lapack-netlib/BLAS/SRC/icamax.f b/lapack-netlib/BLAS/SRC/icamax.f index 8057ab095..02bc90ae4 100644 --- a/lapack-netlib/BLAS/SRC/icamax.f +++ b/lapack-netlib/BLAS/SRC/icamax.f @@ -43,7 +43,7 @@ *> \param[in] INCX *> \verbatim *> INCX is INTEGER -*> storage spacing between elements of SX +*> storage spacing between elements of CX *> \endverbatim * * Authors: diff --git a/lapack-netlib/BLAS/SRC/idamax.f b/lapack-netlib/BLAS/SRC/idamax.f index 7268534db..1578ea950 100644 --- a/lapack-netlib/BLAS/SRC/idamax.f +++ b/lapack-netlib/BLAS/SRC/idamax.f @@ -43,7 +43,7 @@ *> \param[in] INCX *> \verbatim *> INCX is INTEGER -*> storage spacing between elements of SX +*> storage spacing between elements of DX *> \endverbatim * * Authors: diff --git a/lapack-netlib/BLAS/SRC/izamax.f b/lapack-netlib/BLAS/SRC/izamax.f index 63d8e97de..c0aabb7bc 100644 --- a/lapack-netlib/BLAS/SRC/izamax.f +++ b/lapack-netlib/BLAS/SRC/izamax.f @@ -43,7 +43,7 @@ *> \param[in] INCX *> \verbatim *> INCX is INTEGER -*> storage spacing between elements of SX +*> storage spacing between elements of ZX *> \endverbatim * * Authors: diff --git a/lapack-netlib/BLAS/SRC/meson.build b/lapack-netlib/BLAS/SRC/meson.build new file mode 100644 index 000000000..8d96f2acd --- /dev/null +++ b/lapack-netlib/BLAS/SRC/meson.build @@ -0,0 +1,29 @@ +SBLAS1 = files('isamax.f', 'sasum.f', 'saxpy.f', 'scopy.f', 'sdot.f', 'snrm2.f', 'srot.f', 'srotg.f', 'sscal.f', 'sswap.f', 'sdsdot.f', 'srotmg.f', 'srotm.f') + +CBLAS1 = files('scabs1.f', 'scasum.f', 'scnrm2.f', 'icamax.f', 'caxpy.f', 'ccopy.f', 'cdotc.f', 'cdotu.f', 'csscal.f', 'crotg.f', 'cscal.f', 'cswap.f', 'csrot.f') + +DBLAS1 = files('idamax.f', 'dasum.f', 'daxpy.f', 'dcopy.f', 'ddot.f', 'dnrm2.f', 'drot.f', 'drotg.f', 'dscal.f', 'dsdot.f', 'dswap.f', 'drotmg.f', 'drotm.f') + +ZBLAS1 = files('dcabs1.f', 'dzasum.f', 'dznrm2.f', 'izamax.f', 'zaxpy.f', 'zcopy.f', 'zdotc.f', 'zdotu.f', 'zdscal.f', 'zrotg.f', 'zscal.f', 'zswap.f', 'zdrot.f') + +CB1AUX = files('isamax.f', 'sasum.f', 'saxpy.f', 'scopy.f', 'snrm2.f', 'sscal.f') + +ZB1AUX = files('idamax.f', 'dasum.f', 'daxpy.f', 'dcopy.f', 'dnrm2.f', 'dscal.f') + +ALLBLAS = files('lsame.f', 'xerbla.f', 'xerbla_array.f') + +SBLAS2 = files('sgemv.f', 'sgbmv.f', 'ssymv.f', 'ssbmv.f', 'sspmv.f', 'strmv.f', 'stbmv.f', 'stpmv.f', 'strsv.f', 'stbsv.f', 'stpsv.f', 'sger.f', 'ssyr.f', 'sspr.f', 'ssyr2.f', 
'sspr2.f') + +CBLAS2 = files('cgemv.f', 'cgbmv.f', 'chemv.f', 'chbmv.f', 'chpmv.f', 'ctrmv.f', 'ctbmv.f', 'ctpmv.f', 'ctrsv.f', 'ctbsv.f', 'ctpsv.f', 'cgerc.f', 'cgeru.f', 'cher.f', 'chpr.f', 'cher2.f', 'chpr2.f') + +DBLAS2 = files('dgemv.f', 'dgbmv.f', 'dsymv.f', 'dsbmv.f', 'dspmv.f', 'dtrmv.f', 'dtbmv.f', 'dtpmv.f', 'dtrsv.f', 'dtbsv.f', 'dtpsv.f', 'dger.f', 'dsyr.f', 'dspr.f', 'dsyr2.f', 'dspr2.f') + +ZBLAS2 = files('zgemv.f', 'zgbmv.f', 'zhemv.f', 'zhbmv.f', 'zhpmv.f', 'ztrmv.f', 'ztbmv.f', 'ztpmv.f', 'ztrsv.f', 'ztbsv.f', 'ztpsv.f', 'zgerc.f', 'zgeru.f', 'zher.f', 'zhpr.f', 'zher2.f', 'zhpr2.f') + +SBLAS3 = files('sgemm.f', 'ssymm.f', 'ssyrk.f', 'ssyr2k.f', 'strmm.f', 'strsm.f') + +CBLAS3 = files('cgemm.f', 'csymm.f', 'csyrk.f', 'csyr2k.f', 'ctrmm.f', 'ctrsm.f', 'chemm.f', 'cherk.f', 'cher2k.f') + +DBLAS3 = files('dgemm.f', 'dsymm.f', 'dsyrk.f', 'dsyr2k.f', 'dtrmm.f', 'dtrsm.f') + +ZBLAS3 = files('zgemm.f', 'zsymm.f', 'zsyrk.f', 'zsyr2k.f', 'ztrmm.f', 'ztrsm.f', 'zhemm.f', 'zherk.f', 'zher2k.f') diff --git a/lapack-netlib/BLAS/SRC/sdsdot.f b/lapack-netlib/BLAS/SRC/sdsdot.f index a0ec32b6f..a491e6982 100644 --- a/lapack-netlib/BLAS/SRC/sdsdot.f +++ b/lapack-netlib/BLAS/SRC/sdsdot.f @@ -23,13 +23,13 @@ *> *> \verbatim *> -* Compute the inner product of two vectors with extended -* precision accumulation. -* -* Returns S.P. result with dot product accumulated in D.P. -* SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY), -* where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is -* defined in a similar way using INCY. +*> Compute the inner product of two vectors with extended +*> precision accumulation. +*> +*> Returns S.P. result with dot product accumulated in D.P. +*> SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY), +*> where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is +*> defined in a similar way using INCY. *> \endverbatim * * Arguments: @@ -77,7 +77,14 @@ *> \author Lawson, C. L., (JPL), Hanson, R. J., (SNLA), *> \author Kincaid, D. R., (U. of Texas), Krogh, F. T., (JPL) * -*> \ingroup complex_blas_level1 +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2017 +* +*> \ingroup single_blas_level1 * *> \par Further Details: * ===================== @@ -102,65 +109,7 @@ *> 920501 Reformatted the REFERENCES section. (WRB) *> 070118 Reformat to LAPACK coding style *> \endverbatim -* -* ===================================================================== -* -* .. Local Scalars .. -* DOUBLE PRECISION DSDOT -* INTEGER I,KX,KY,NS -* .. -* .. Intrinsic Functions .. -* INTRINSIC DBLE -* .. -* DSDOT = SB -* IF (N.LE.0) THEN -* SDSDOT = DSDOT -* RETURN -* END IF -* IF (INCX.EQ.INCY .AND. INCX.GT.0) THEN -* -* Code for equal and positive increments. -* -* NS = N*INCX -* DO I = 1,NS,INCX -* DSDOT = DSDOT + DBLE(SX(I))*DBLE(SY(I)) -* END DO -* ELSE -* -* Code for unequal or nonpositive increments. -* -* KX = 1 -* KY = 1 -* IF (INCX.LT.0) KX = 1 + (1-N)*INCX -* IF (INCY.LT.0) KY = 1 + (1-N)*INCY -* DO I = 1,N -* DSDOT = DSDOT + DBLE(SX(KX))*DBLE(SY(KY)) -* KX = KX + INCX -* KY = KY + INCY -* END DO -* END IF -* SDSDOT = DSDOT -* RETURN -* END -* -*> \par Purpose: -* ============= *> -*> \verbatim -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. 
-* -*> \date November 2017 -* -*> \ingroup single_blas_level1 -* * ===================================================================== REAL FUNCTION SDSDOT(N,SB,SX,INCX,SY,INCY) * @@ -175,71 +124,6 @@ * .. * .. Array Arguments .. REAL SX(*),SY(*) -* .. -* -* PURPOSE -* ======= -* -* Compute the inner product of two vectors with extended -* precision accumulation. -* -* Returns S.P. result with dot product accumulated in D.P. -* SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY), -* where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is -* defined in a similar way using INCY. -* -* AUTHOR -* ====== -* Lawson, C. L., (JPL), Hanson, R. J., (SNLA), -* Kincaid, D. R., (U. of Texas), Krogh, F. T., (JPL) -* -* ARGUMENTS -* ========= -* -* N (input) INTEGER -* number of elements in input vector(s) -* -* SB (input) REAL -* single precision scalar to be added to inner product -* -* SX (input) REAL array, dimension (N) -* single precision vector with N elements -* -* INCX (input) INTEGER -* storage spacing between elements of SX -* -* SY (input) REAL array, dimension (N) -* single precision vector with N elements -* -* INCY (input) INTEGER -* storage spacing between elements of SY -* -* SDSDOT (output) REAL -* single precision dot product (SB if N .LE. 0) -* -* Further Details -* =============== -* -* REFERENCES -* -* C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T. -* Krogh, Basic linear algebra subprograms for Fortran -* usage, Algorithm No. 539, Transactions on Mathematical -* Software 5, 3 (September 1979), pp. 308-323. -* -* REVISION HISTORY (YYMMDD) -* -* 791001 DATE WRITTEN -* 890531 Changed all specific intrinsics to generic. (WRB) -* 890831 Modified array declarations. (WRB) -* 890831 REVISION DATE from Version 3.2 -* 891214 Prologue converted to Version 4.0 format. (BAB) -* 920310 Corrected definition of LX in DESCRIPTION. (WRB) -* 920501 Reformatted the REFERENCES section. (WRB) -* 070118 Reformat to LAPACK coding style -* -* ===================================================================== -* * .. Local Scalars .. DOUBLE PRECISION DSDOT INTEGER I,KX,KY,NS diff --git a/lapack-netlib/BLAS/TESTING/Makefile b/lapack-netlib/BLAS/TESTING/Makefile index 97150b1a3..5b3b0d6ee 100644 --- a/lapack-netlib/BLAS/TESTING/Makefile +++ b/lapack-netlib/BLAS/TESTING/Makefile @@ -1,5 +1,7 @@ -include ../../make.inc +TOPSRCDIR = ../.. 
+include $(TOPSRCDIR)/make.inc +.PHONY: all single double complex complex16 all: single double complex complex16 single: xblat1s xblat2s xblat3s double: xblat1d xblat2d xblat3d @@ -7,32 +9,33 @@ complex: xblat1c xblat2c xblat3c complex16: xblat1z xblat2z xblat3z xblat1s: sblat1.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat1d: dblat1.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat1c: cblat1.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat1z: zblat1.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat2s: sblat2.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat2d: dblat2.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat2c: cblat2.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat2z: zblat2.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat3s: sblat3.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat3d: dblat3.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat3c: cblat3.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xblat3z: zblat3.o $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +.PHONY: run run: all ./xblat1s > sblat1.out ./xblat1d > dblat1.out @@ -47,6 +50,7 @@ run: all ./xblat3c < cblat3.in ./xblat3z < zblat3.in +.PHONY: clean cleanobj cleanexe cleantest clean: cleanobj cleanexe cleantest cleanobj: rm -f *.o @@ -54,6 +58,3 @@ cleanexe: rm -f xblat* cleantest: rm -f *.out core - -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< diff --git a/lapack-netlib/BLAS/TESTING/cblat1.f b/lapack-netlib/BLAS/TESTING/cblat1.f index 036dca3e0..ecf2a44cb 100644 --- a/lapack-netlib/BLAS/TESTING/cblat1.f +++ b/lapack-netlib/BLAS/TESTING/cblat1.f @@ -619,7 +619,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/lapack-netlib/BLAS/TESTING/dblat1.f b/lapack-netlib/BLAS/TESTING/dblat1.f index f3255fef4..28af121cd 100644 --- a/lapack-netlib/BLAS/TESTING/dblat1.f +++ b/lapack-netlib/BLAS/TESTING/dblat1.f @@ -991,7 +991,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/lapack-netlib/BLAS/TESTING/sblat1.f b/lapack-netlib/BLAS/TESTING/sblat1.f index a5c1c6af6..fe05bbe87 100644 --- a/lapack-netlib/BLAS/TESTING/sblat1.f +++ b/lapack-netlib/BLAS/TESTING/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
* diff --git a/lapack-netlib/BLAS/TESTING/zblat1.f b/lapack-netlib/BLAS/TESTING/zblat1.f index 4b0bcf884..2d7b88490 100644 --- a/lapack-netlib/BLAS/TESTING/zblat1.f +++ b/lapack-netlib/BLAS/TESTING/zblat1.f @@ -619,7 +619,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/lapack-netlib/CBLAS/CMakeLists.txt b/lapack-netlib/CBLAS/CMakeLists.txt index d9fa24530..04c5ab795 100644 --- a/lapack-netlib/CBLAS/CMakeLists.txt +++ b/lapack-netlib/CBLAS/CMakeLists.txt @@ -12,8 +12,10 @@ FortranCInterface_HEADER(${LAPACK_BINARY_DIR}/include/cblas_mangling.h SYMBOL_NAMESPACE "F77_") if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND) message(WARNING "Reverting to pre-defined include/lapacke_mangling.h") - configure_file(include/lapacke_mangling_with_flags.h.in - ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h) + configure_file(include/lapacke_mangling_with_flags.h.in + ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h) + configure_file(include/cblas_mangling_with_flags.h.in + ${LAPACK_BINARY_DIR}/include/cblas_mangling.h) endif() include_directories(include ${LAPACK_BINARY_DIR}/include) @@ -28,7 +30,10 @@ endforeach() endmacro() append_subdir_files(CBLAS_INCLUDE "include") -install(FILES ${CBLAS_INCLUDE} ${LAPACK_BINARY_DIR}/include/cblas_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(FILES ${CBLAS_INCLUDE} ${LAPACK_BINARY_DIR}/include/cblas_mangling.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + COMPONENT Development + ) # -------------------------------------------------- if(BUILD_TESTING) @@ -45,7 +50,9 @@ endif() set(_cblas_config_install_guard_target "") if(ALL_TARGETS) install(EXPORT cblas-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}) + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION} + COMPONENT Development + ) # Choose one of the cblas targets to use as a guard for # cblas-config.cmake to load targets from the install tree. list(GET ALL_TARGETS 0 _cblas_config_install_guard_target) @@ -82,4 +89,6 @@ install(FILES ) #install(EXPORT cblas-targets -# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}) +# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION} +# COMPONENT Development +# ) diff --git a/lapack-netlib/CBLAS/Makefile b/lapack-netlib/CBLAS/Makefile index 513e8fc82..6e199cdce 100644 --- a/lapack-netlib/CBLAS/Makefile +++ b/lapack-netlib/CBLAS/Makefile @@ -1,19 +1,25 @@ -include ../make.inc +TOPSRCDIR = .. +include $(TOPSRCDIR)/make.inc +.PHONY: all all: cblas +.PHONY: cblas cblas: include/cblas_mangling.h $(MAKE) -C src include/cblas_mangling.h: include/cblas_mangling_with_flags.h.in - cp $< $@ + cp include/cblas_mangling_with_flags.h.in $@ +.PHONY: cblas_testing cblas_testing: cblas $(MAKE) -C testing run +.PHONY: cblas_example cblas_example: cblas $(MAKE) -C examples +.PHONY: clean cleanobj cleanlib cleanexe cleantest clean: $(MAKE) -C src clean $(MAKE) -C testing clean diff --git a/lapack-netlib/CBLAS/examples/Makefile b/lapack-netlib/CBLAS/examples/Makefile index 664b8bc57..84acd6561 100644 --- a/lapack-netlib/CBLAS/examples/Makefile +++ b/lapack-netlib/CBLAS/examples/Makefile @@ -1,17 +1,21 @@ -include ../../make.inc +TOPSRCDIR = ../.. 
+include $(TOPSRCDIR)/make.inc +.SUFFIXES: .c .o +.c.o: + $(CC) $(CFLAGS) -I../include -c -o $@ $< + +.PHONY: all all: cblas_ex1 cblas_ex2 cblas_ex1: cblas_example1.o $(CBLASLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ cblas_ex2: cblas_example2.o $(CBLASLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +.PHONY: clean cleanobj cleanexe clean: cleanobj cleanexe cleanobj: rm -f *.o cleanexe: rm -f cblas_ex1 cblas_ex2 - -.c.o: - $(CC) $(CFLAGS) -I../include -c -o $@ $< diff --git a/lapack-netlib/CBLAS/examples/cblas_example1.c b/lapack-netlib/CBLAS/examples/cblas_example1.c index c3acd554d..3d5ed330c 100644 --- a/lapack-netlib/CBLAS/examples/cblas_example1.c +++ b/lapack-netlib/CBLAS/examples/cblas_example1.c @@ -47,7 +47,7 @@ int main ( ) a[m*3+1] = 6; a[m*3+2] = 7; a[m*3+3] = 8; - /* The elemetns of x and y */ + /* The elements of x and y */ x[0] = 1; x[1] = 2; x[2] = 1; diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile index 6c0518ac7..7100568e4 100644 --- a/lapack-netlib/CBLAS/src/Makefile +++ b/lapack-netlib/CBLAS/src/Makefile @@ -1,7 +1,13 @@ # This Makefile compiles the CBLAS routines -include ../../make.inc +TOPSRCDIR = ../.. +include $(TOPSRCDIR)/make.inc +.SUFFIXES: .c .o +.c.o: + $(CC) $(CFLAGS) -I../include -c -o $@ $< + +.PHONY: all all: $(CBLASLIB) # Error handling routines for level 2 & 3 @@ -43,24 +49,25 @@ zlev1 = cblas_zswap.o cblas_zscal.o cblas_zdscal.o cblas_zcopy.o \ # Common files for level 1 single precision sclev1 = cblas_scasum.o scasumsub.o cblas_scnrm2.o scnrm2sub.o +.PHONY: slib1 dlib1 clib1 zlib1 # Single precision real slib1: $(slev1) $(sclev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib1: $(dlev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib1: $(clev1) $(sclev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib1: $(zlev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -95,24 +102,25 @@ zlev2 = cblas_zgemv.o cblas_zgbmv.o cblas_zhemv.o cblas_zhbmv.o cblas_zhpmv.o \ cblas_ztpsv.o cblas_zgeru.o cblas_zgerc.o cblas_zher.o cblas_zher2.o \ cblas_zhpr.o cblas_zhpr2.o +.PHONY: slib2 dlib2 clib2 zlib2 # Single precision real slib2: $(slev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib2: $(dlev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib2: $(clev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib2: $(zlev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -141,24 +149,25 @@ zlev3 = cblas_zgemm.o cblas_zsymm.o cblas_zhemm.o cblas_zherk.o \ cblas_zher2k.o cblas_ztrmm.o cblas_ztrsm.o cblas_zsyrk.o \ cblas_zsyr2k.o +.PHONY: slib3 dlib3 clib3 zlib3 # Single precision real slib3: $(slev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib3: $(dlev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex 
clib3: $(clev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib3: $(zlev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) @@ -166,36 +175,33 @@ alev1 = $(slev1) $(dlev1) $(clev1) $(zlev1) $(sclev1) alev2 = $(slev2) $(dlev2) $(clev2) $(zlev2) alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3) +.PHONY: all1 all2 all3 # All level 1 all1: $(alev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 2 all2: $(alev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 3 all3: $(alev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All levels and precisions $(CBLASLIB): $(alev1) $(alev2) $(alev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ FRC: @FRC=$(FRC) +.PHONY: clean cleanobj cleanlib clean: cleanobj cleanlib cleanobj: rm -f *.o cleanlib: rm -f $(CBLASLIB) - -.c.o: - $(CC) $(CFLAGS) -I../include -c -o $@ $< -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< diff --git a/lapack-netlib/CBLAS/src/cblas_sgemm.c b/lapack-netlib/CBLAS/src/cblas_sgemm.c index c4a49a2db..51cd7d18a 100644 --- a/lapack-netlib/CBLAS/src/cblas_sgemm.c +++ b/lapack-netlib/CBLAS/src/cblas_sgemm.c @@ -91,7 +91,7 @@ void cblas_sgemm(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, else { cblas_xerbla(2, "cblas_sgemm", - "Illegal TransA setting, %d\n", TransA); + "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; diff --git a/lapack-netlib/CBLAS/testing/Makefile b/lapack-netlib/CBLAS/testing/Makefile index 0182c3e88..e3b615b41 100644 --- a/lapack-netlib/CBLAS/testing/Makefile +++ b/lapack-netlib/CBLAS/testing/Makefile @@ -2,7 +2,12 @@ # The Makefile compiles c wrappers and testers for CBLAS. # -include ../../make.inc +TOPSRCDIR = ../.. 
+include $(TOPSRCDIR)/make.inc + +.SUFFIXES: .c .o +.c.o: + $(CC) $(CFLAGS) -I../include -c -o $@ $< # Archive files necessary to compile LIB = $(CBLASLIB) $(BLASLIB) @@ -27,6 +32,7 @@ ztestl1o = c_zblas1.o ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o +.PHONY: all all1 all2 all3 all: all1 all2 all3 all1: xscblat1 xdcblat1 xccblat1 xzcblat1 all2: xscblat2 xdcblat2 xccblat2 xzcblat2 @@ -38,37 +44,38 @@ all3: xscblat3 xdcblat3 xccblat3 xzcblat3 # Single real xscblat1: c_sblat1.o $(stestl1o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xscblat2: c_sblat2.o $(stestl2o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xscblat3: c_sblat3.o $(stestl3o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ # Double real xdcblat1: c_dblat1.o $(dtestl1o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdcblat2: c_dblat2.o $(dtestl2o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdcblat3: c_dblat3.o $(dtestl3o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ # Single complex xccblat1: c_cblat1.o $(ctestl1o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xccblat2: c_cblat2.o $(ctestl2o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xccblat3: c_cblat3.o $(ctestl3o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ # Double complex xzcblat1: c_zblat1.o $(ztestl1o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xzcblat2: c_zblat2.o $(ztestl2o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ xzcblat3: c_zblat3.o $(ztestl3o) $(LIB) - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ # RUN TESTS +.PHONY: run run: all @echo "--> TESTING CBLAS 1 - SINGLE PRECISION REAL <--" @./xscblat1 > stest1.out @@ -95,6 +102,7 @@ run: all @echo "--> TESTING CBLAS 3 - DOUBLE PRECISION COMPLEX <--" @./xzcblat3 < zin3 > ztest3.out +.PHONY: clean cleanobj cleanexe cleantest clean: cleanobj cleanexe cleantest cleanobj: rm -f *.o @@ -102,9 +110,3 @@ cleanexe: rm -f x* cleantest: rm -f *.out core - -.SUFFIXES: .o .f .c -.c.o: - $(CC) $(CFLAGS) -I../include -c -o $@ $< -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< diff --git a/lapack-netlib/CBLAS/testing/c_cblat1.f b/lapack-netlib/CBLAS/testing/c_cblat1.f index c741ce506..1a123d74d 100644 --- a/lapack-netlib/CBLAS/testing/c_cblat1.f +++ b/lapack-netlib/CBLAS/testing/c_cblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
* diff --git a/lapack-netlib/CBLAS/testing/c_dblat1.f b/lapack-netlib/CBLAS/testing/c_dblat1.f index c570a9140..4a71b4dcf 100644 --- a/lapack-netlib/CBLAS/testing/c_dblat1.f +++ b/lapack-netlib/CBLAS/testing/c_dblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/lapack-netlib/CBLAS/testing/c_sblat1.f b/lapack-netlib/CBLAS/testing/c_sblat1.f index 773787d6f..89902f12d 100644 --- a/lapack-netlib/CBLAS/testing/c_sblat1.f +++ b/lapack-netlib/CBLAS/testing/c_sblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/lapack-netlib/CBLAS/testing/c_zblat1.f b/lapack-netlib/CBLAS/testing/c_zblat1.f index 03753e782..cd0c8541d 100644 --- a/lapack-netlib/CBLAS/testing/c_zblat1.f +++ b/lapack-netlib/CBLAS/testing/c_zblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake index acc51629e..add0d1797 100644 --- a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake +++ b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake @@ -1,4 +1,4 @@ -# This module checks against various known compilers and thier respective +# This module checks against various known compilers and their respective # flags to determine any specific flags needing to be set. # # 1. If FPE traps are enabled either abort or disable them diff --git a/lapack-netlib/CMAKE/FindGcov.cmake b/lapack-netlib/CMAKE/FindGcov.cmake index 4807f903e..3d4c0a2a0 100644 --- a/lapack-netlib/CMAKE/FindGcov.cmake +++ b/lapack-netlib/CMAKE/FindGcov.cmake @@ -20,7 +20,7 @@ set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) foreach (LANG ${ENABLED_LANGUAGES}) - # Gcov evaluation is dependend on the used compiler. Check gcov support for + # Gcov evaluation is dependent on the used compiler. Check gcov support for # each compiler that is used. If gcov binary was already found for this # compiler, do not try to find it again. if(NOT GCOV_${CMAKE_${LANG}_COMPILER_ID}_BIN) diff --git a/lapack-netlib/CMAKE/Findcodecov.cmake b/lapack-netlib/CMAKE/Findcodecov.cmake index 1f33b2c09..384064007 100644 --- a/lapack-netlib/CMAKE/Findcodecov.cmake +++ b/lapack-netlib/CMAKE/Findcodecov.cmake @@ -42,7 +42,7 @@ set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) foreach (LANG ${ENABLED_LANGUAGES}) - # Coverage flags are not dependend on language, but the used compiler. So + # Coverage flags are not dependent on language, but the used compiler. 
So # instead of searching flags foreach language, search flags foreach compiler # used. set(COMPILER ${CMAKE_${LANG}_COMPILER_ID}) diff --git a/lapack-netlib/CMAKE/FortranMangling.cmake b/lapack-netlib/CMAKE/FortranMangling.cmake index d772dc9bb..734ab6f4c 100644 --- a/lapack-netlib/CMAKE/FortranMangling.cmake +++ b/lapack-netlib/CMAKE/FortranMangling.cmake @@ -24,7 +24,7 @@ message(STATUS "=========") set(F77_OUTPUT_EXE "/Fe" CACHE INTERNAL "Fortran compiler option for setting executable file name.") else() - # in other case, let user specify their fortran configrations. + # in other case, let user specify their fortran configurations. set(F77_OPTION_COMPILE "-c" CACHE STRING "Fortran compiler option for compiling without linking.") set(F77_OUTPUT_OBJ "-o" CACHE STRING diff --git a/lapack-netlib/CMAKE/lapack-config-build.cmake.in b/lapack-netlib/CMAKE/lapack-config-build.cmake.in index 1d084fe13..f7e041663 100644 --- a/lapack-netlib/CMAKE/lapack-config-build.cmake.in +++ b/lapack-netlib/CMAKE/lapack-config-build.cmake.in @@ -5,6 +5,10 @@ if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}") endif() unset(_LAPACK_TARGET) +# Hint for project building against lapack +set(LAPACK_Fortran_COMPILER_ID "@CMAKE_Fortran_COMPILER_ID@") + # Report the blas and lapack raw or imported libraries. set(LAPACK_blas_LIBRARIES "@BLAS_LIBRARIES@") set(LAPACK_lapack_LIBRARIES "@LAPACK_LIBRARIES@") +set(LAPACK_LIBRARIES ${LAPACK_blas_LIBRARIES} ${LAPACK_lapack_LIBRARIES}) diff --git a/lapack-netlib/CMAKE/lapack-config-install.cmake.in b/lapack-netlib/CMAKE/lapack-config-install.cmake.in index 4e04f8711..3de7362ea 100644 --- a/lapack-netlib/CMAKE/lapack-config-install.cmake.in +++ b/lapack-netlib/CMAKE/lapack-config-install.cmake.in @@ -8,8 +8,12 @@ if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}") endif() unset(_LAPACK_TARGET) +# Hint for project building against lapack +set(LAPACK_Fortran_COMPILER_ID "@CMAKE_Fortran_COMPILER_ID@") + # Report the blas and lapack raw or imported libraries. 
set(LAPACK_blas_LIBRARIES "@BLAS_LIBRARIES@") set(LAPACK_lapack_LIBRARIES "@LAPACK_LIBRARIES@") +set(LAPACK_LIBRARIES ${LAPACK_blas_LIBRARIES} ${LAPACK_lapack_LIBRARIES}) unset(_LAPACK_SELF_DIR) diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index caa0e7107..df43d91b1 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8.12) project(LAPACK Fortran C) set(LAPACK_MAJOR_VERSION 3) -set(LAPACK_MINOR_VERSION 8) +set(LAPACK_MINOR_VERSION 9) set(LAPACK_PATCH_VERSION 0) set( LAPACK_VERSION @@ -13,6 +13,9 @@ set( # Add the CMake directory for custon CMake modules set(CMAKE_MODULE_PATH "${LAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH}) +# Export all symbols on Windows when building shared libraries +SET(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS TRUE) + # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "Setting build type to 'Release' as none was specified.") @@ -21,8 +24,19 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "Coverage") endif() -string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER) -if(${CMAKE_BUILD_TYPE_UPPER} STREQUAL "COVERAGE") +# Coverage +set(_is_coverage_build 0) +set(_msg "Checking if build type is 'Coverage'") +message(STATUS "${_msg}") +if(NOT CMAKE_CONFIGURATION_TYPES) + string(TOLOWER ${CMAKE_BUILD_TYPE} _build_type_lc) + if(${_build_type_lc} STREQUAL "coverage") + set(_is_coverage_build 1) + endif() +endif() +message(STATUS "${_msg}: ${_is_coverage_build}") + +if(_is_coverage_build) message(STATUS "Adding coverage") find_package(codecov) endif() @@ -58,18 +72,18 @@ include(PreventInSourceBuilds) include(PreventInBuildInstalls) if(UNIX) - if("${CMAKE_Fortran_COMPILER}" MATCHES "ifort") - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") + if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) + list(APPEND CMAKE_Fortran_FLAGS "-fp-model strict") endif() - if("${CMAKE_Fortran_COMPILER}" MATCHES "xlf") - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") + if(CMAKE_Fortran_COMPILER_ID STREQUAL XL) + list(APPEND CMAKE_Fortran_FLAGS "-qnosave -qstrict=none") endif() # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. 
# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}") endif() -if(CMAKE_Fortran_COMPILER_ID STREQUAL "Compaq") +if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq) if(WIN32) if(CMAKE_GENERATOR STREQUAL "NMake Makefiles") get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE) @@ -96,24 +110,16 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL "Compaq") endif() endif() -# Get Python -message(STATUS "Looking for Python greater than 2.6 - ${PYTHONINTERP_FOUND}") -find_package(PythonInterp 2.7) # lapack_testing.py uses features from python 2.7 and greater -if(PYTHONINTERP_FOUND) - message(STATUS "Using Python version ${PYTHON_VERSION_STRING}") -else() - message(STATUS "No suitable Python version found, so skipping summary tests.") -endif() -# -------------------------------------------------- +# -------------------------------------------------- set(LAPACK_INSTALL_EXPORT_NAME lapack-targets) macro(lapack_install_library lib) install(TARGETS ${lib} EXPORT ${LAPACK_INSTALL_EXPORT_NAME} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT Development + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT RuntimeLibraries + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT RuntimeLibraries ) endmacro() @@ -121,12 +127,22 @@ set(PKG_CONFIG_DIR ${CMAKE_INSTALL_LIBDIR}/pkgconfig) # -------------------------------------------------- # Testing -option(BUILD_TESTING "Build tests" OFF) -enable_testing() +option(BUILD_TESTING "Build tests" ${_is_coverage_build}) include(CTest) -enable_testing() message(STATUS "Build tests: ${BUILD_TESTING}") +# lapack_testing.py uses features from python 2.7 and greater +if(BUILD_TESTING) + set(_msg "Looking for Python >= 2.7 needed for summary tests") + message(STATUS "${_msg}") + find_package(PythonInterp 2.7 QUIET) + if(PYTHONINTERP_FOUND) + message(STATUS "${_msg} - found (${PYTHON_VERSION_STRING})") + else() + message(STATUS "${_msg} - not found (skipping summary tests)") + endif() +endif() + # -------------------------------------------------- # Organize output files. On Windows this also keeps .dll files next # to the .exe files that need them, making tests easy to run. @@ -299,16 +315,40 @@ if(LAPACKE) add_subdirectory(LAPACKE) endif() +#------------------------------------- +# BLAS++ / LAPACK++ +option(BLAS++ "Build BLAS++" OFF) +option(LAPACK++ "Build LAPACK++" OFF) + + +function(_display_cpp_implementation_msg name) + string(TOLOWER ${name} name_lc) + message(STATUS "${name}++ enable") + message(STATUS "----------------") + message(STATUS "Thank you for your interest in ${name}++, a newly developed C++ API for the ${name} library") + message(STATUS "The objective of ${name}++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantage of modern C++ features, such as: namespaces, templates, exceptions, etc.") + message(STATUS "We are still working on integrating ${name}++ in our library. For the moment, you can download directly ${name_lc}++ from https://bitbucket.org/icl/${name_lc}pp")
+ message(STATUS "For support with ${name}++ related questions, please email: slate-user@icl.utk.edu") + message(STATUS "----------------") +endfunction() +if(BLAS++) + _display_cpp_implementation_msg("BLAS") +endif() +if(LAPACK++) + _display_cpp_implementation_msg("LAPACK") +endif() + # -------------------------------------------------- # CPACK Packaging set(CPACK_PACKAGE_NAME "LAPACK") set(CPACK_PACKAGE_VENDOR "University of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "LAPACK- Linear Algebra Package") -set(CPACK_PACKAGE_VERSION_MAJOR 3) -set(CPACK_PACKAGE_VERSION_MINOR 5) -set(CPACK_PACKAGE_VERSION_PATCH 0) +set(CPACK_PACKAGE_VERSION_MAJOR ${LAPACK_MAJOR_VERSION}) +set(CPACK_PACKAGE_VERSION_MINOR ${LAPACK_MINOR_VERSION}) +set(CPACK_PACKAGE_VERSION_PATCH ${LAPACK_PATCH_VERSION}) set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") +set(CPACK_MONOLITHIC_INSTALL ON) set(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK") if(WIN32 AND NOT UNIX) # There is a bug in NSI that does not handle full unix paths properly. Make @@ -347,7 +387,9 @@ endif() set(_lapack_config_install_guard_target "") if(ALL_TARGETS) install(EXPORT lapack-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION}) + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION} + COMPONENT Development + ) # Choose one of the lapack targets to use as a guard for # lapack-config.cmake to load targets from the install tree. @@ -382,6 +424,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_D install(FILES ${CMAKE_CURRENT_BINARY_DIR}/lapack.pc DESTINATION ${PKG_CONFIG_DIR} + COMPONENT Development ) configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-install.cmake.in @@ -398,4 +441,6 @@ install(FILES ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake ${LAPACK_BINARY_DIR}/lapack-config-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION} + COMPONENT Development ) + diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile index 8f3558597..43cea43b5 100644 --- a/lapack-netlib/DOCS/Doxyfile +++ b/lapack-netlib/DOCS/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = LAPACK # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 3.8.0 +PROJECT_NUMBER = 3.9.0 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/lapack-netlib/DOCS/Doxyfile_man b/lapack-netlib/DOCS/Doxyfile_man index 6fb339a73..1767cf5f4 100644 --- a/lapack-netlib/DOCS/Doxyfile_man +++ b/lapack-netlib/DOCS/Doxyfile_man @@ -38,7 +38,7 @@ PROJECT_NAME = LAPACK # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 3.8.0 +PROJECT_NUMBER = 3.9.0 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 291735299..794c2a7aa 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -439,39 +439,39 @@ SHELL = /bin/sh \end{quote} and it will need to be modified to \texttt{SHELL = /sbin/sh} if you are installing LAPACK on an SGI architecture.
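The lawn81.tex hunk that follows replaces the old make.inc names (FORTRAN, OPTS, NOOPT, LOADER, LOADOPTS, ARCH, ARCHFLAGS) with the GNU-style names FC, FFLAGS, FFLAGS_DRV, FFLAGS_NOOPT, LDFLAGS, AR, and ARFLAGS. As a rough sketch of what a make.inc written against the new names might look like (the compiler and flag values below are illustrative placeholders, not taken from this patch):

    # Hypothetical make.inc fragment using the 3.9.0 variable names;
    # gfortran and its flags are example values only.
    FC           = gfortran
    FFLAGS       = -O2
    FFLAGS_DRV   = $(FFLAGS)
    FFLAGS_NOOPT = -O0
    LDFLAGS      =
    AR           = ar
    ARFLAGS      = cr
    RANLIB       = ranlib
    TIMER        = INT_ETIME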
-Second, you will -need to modify the \texttt{PLAT} definition, which is appended to all -library names, to specify the architecture to which you are installing -LAPACK. This features avoids confusion in library names when you are -installing LAPACK on more than one architecture. Next, you will need -to modify \texttt{FORTRAN}, \texttt{OPTS}, \texttt{DRVOPTS}, \texttt{NOOPT}, \texttt{LOADER}, -and \texttt{LOADOPTS} to specify +Next, you will need to modify \texttt{FC}, \texttt{FFLAGS}, +\texttt{FFLAGS\_DRV}, \texttt{FFLAGS\_NOOPT}, and \texttt{LDFLAGS} to specify the compiler, compiler options, compiler options for the testing and -timing\footnotemark[\value{footnote}] main programs, loader, loader options. -Next you will have to choose which function you will use to time in the \texttt{SECOND} and \texttt{DSECND} routines. +timing\footnotemark[\value{footnote}] main programs, and linker options. +Next you will have to choose which function you will use to time in the +\texttt{SECOND} and \texttt{DSECND} routines. \begin{verbatim} -#The Default : SECOND and DSECND will use a call to the EXTERNAL FUNCTION ETIME -TIMER = EXT_ETIME -# For RS6K : SECOND and DSECND will use a call to the EXTERNAL FUNCTION ETIME_ -# TIMER = EXT_ETIME_ -# For gfortran compiler: SECOND and DSECND will use the INTERNAL FUNCTION ETIME -# TIMER = INT_ETIME -# If your Fortran compiler does not provide etime (like Nag Fortran Compiler, etc...) -# SECOND and DSECND will use a call to the INTERNAL FUNCTION CPU_TIME -# TIMER = INT_CPU_TIME -# If neither of this works...you can use the NONE value... -# In that case, SECOND and DSECND will always return 0 -# TIMER = NONE +# Default: SECOND and DSECND will use a call to the +# EXTERNAL FUNCTION ETIME +#TIMER = EXT_ETIME +# For RS6K: SECOND and DSECND will use a call to the +# EXTERNAL FUNCTION ETIME_ +#TIMER = EXT_ETIME_ +# For gfortran compiler: SECOND and DSECND will use a call to the +# INTERNAL FUNCTION ETIME +TIMER = INT_ETIME +# If your Fortran compiler does not provide etime (like Nag Fortran +# Compiler, etc...) SECOND and DSECND will use a call to the +# INTERNAL FUNCTION CPU_TIME +#TIMER = INT_CPU_TIME +# If none of these work, you can use the NONE value. +# In that case, SECOND and DSECND will always return 0. +#TIMER = NONE \end{verbatim} Refer to the section~\ref{second} to get more information. -Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver, +Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver, archiver options, and ranlib for your machine. If your architecture does not require \texttt{ranlib} to be run after each archive command (as is the case with CRAY computers running UNICOS, Hewlett Packard computers running HP-UX, or SUN SPARCstations running Solaris), set -\texttt{ranlib=echo}. And finally, you must +\texttt{RANLIB = echo}. And finally, you must modify the \texttt{BLASLIB} definition to specify the BLAS library to which you will be linking. If an optimized version of the BLAS is available on your machine, you are highly recommended to link to that library. @@ -721,24 +721,24 @@ The version that will be used depends on the value of the TIMER variable in the \begin{itemize} \item If ETIME is available as an external function, set the value of the TIMER variable in your -make.inc to \texttt{EXT\_ETIME}:\texttt{second\_EXT\_ETIME.f} and \texttt{dsecnd\_EXT\_ETIME.f} will be used. 
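The TIMER value is spliced directly into file names: the INSTALL/Makefile shown later in this patch builds second_$(TIMER).o and dsecnd_$(TIMER).o, so each setting selects a matching pair of implementation files. A hedged illustration (the chosen value is an example, not a recommendation):

    # Illustrative make.inc setting for a compiler that only provides the
    # CPU_TIME intrinsic (example value, not part of this patch):
    TIMER = INT_CPU_TIME
    # The build then compiles second_INT_CPU_TIME.f and dsecnd_INT_CPU_TIME.f,
    # whose SECOND and DSECND return values from the CPU_TIME intrinsic.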
+make.inc to \texttt{EXT\_ETIME}: \texttt{second\_EXT\_ETIME.f} and \texttt{dsecnd\_EXT\_ETIME.f} will be used. Usually on HPPA architectures, -the compiler and loader flag \texttt{+U77} should be included to access +the compiler and linker flag \texttt{+U77} should be included to access the function \texttt{ETIME}. \item If ETIME\_ is available as an external function, set the value of the TIMER variable in your make.inc -to \texttt{EXT\_ETIME\_}:\texttt{second\_EXT\_ETIME\_.f} and \texttt{dsecnd\_EXT\_ETIME\_.f} will be used. +to \texttt{EXT\_ETIME\_}: \texttt{second\_EXT\_ETIME\_.f} and \texttt{dsecnd\_EXT\_ETIME\_.f} will be used. It is the case on some IBM architectures such as IBM RS/6000s. \item If ETIME is available as an internal function, set the value of the TIMER variable in your make.inc -to \texttt{INT\_ETIME}:\texttt{second\_INT\_ETIME.f} and \texttt{dsecnd\_INT\_ETIME.f} will be used. +to \texttt{INT\_ETIME}: \texttt{second\_INT\_ETIME.f} and \texttt{dsecnd\_INT\_ETIME.f} will be used. This is the case with gfortan. \item If CPU\_TIME is available as an internal function, set the value of the TIMER variable in your make.inc -to \texttt{INT\_CPU\_TIME}:\texttt{second\_INT\_CPU\_TIME.f} and \texttt{dsecnd\_INT\_CPU\_TIME.f} will be used. +to \texttt{INT\_CPU\_TIME}: \texttt{second\_INT\_CPU\_TIME.f} and \texttt{dsecnd\_INT\_CPU\_TIME.f} will be used. \item If none of these function is available, set the value of the TIMER variable in your make.inc -to \texttt{NONE:}\texttt{second\_NONE.f} and \texttt{dsecnd\_NONE.f} will be used. +to \texttt{NONE}: \texttt{second\_NONE.f} and \texttt{dsecnd\_NONE.f} will be used. These routines will always return zero. \end{itemize} @@ -829,8 +829,8 @@ data type to the library if necessary. \end{itemize} \noindent -The BLAS library is created in \texttt{LAPACK/blas\_PLAT.a}, where -\texttt{PLAT} is the user-defined architecture suffix specified in the file +The BLAS library is created in \texttt{LAPACK/librefblas.a}, +or in the user-defined location specified by \texttt{BLASLIB} in the file \texttt{LAPACK/make.inc}. \subsection{Run the BLAS Test Programs}\label{testblas} @@ -882,8 +882,8 @@ data type to the library if necessary. \end{itemize} \noindent -The LAPACK library is created in \texttt{LAPACK/lapack\_PLAT.a}, where -\texttt{PLAT} is the user-defined architecture suffix specified in the file +The LAPACK library is created in \texttt{LAPACK/liblapack.a}, +or in the user-defined location specified by \texttt{LAPACKLIB} in the file \texttt{LAPACK/make.inc}. \subsection{Create the Test Matrix Generator Library} @@ -902,9 +902,9 @@ data type to the library if necessary. \end{itemize} \noindent -The test matrix generator library is created in \texttt{LAPACK/tmglib\_PLAT.a}, -where \texttt{PLAT} is the user-defined architecture suffix specified in the -file \texttt{LAPACK/make.inc}. +The test matrix generator library is created in \texttt{LAPACK/libtmglib.a}, +or in the user-defined location specified by \texttt{TMGLIB} in the file +\texttt{LAPACK/make.inc}. \subsection{Run the LAPACK Test Programs} @@ -1114,9 +1114,7 @@ To make a library of the instrumented LAPACK routines, first go to \texttt{LAPACK/TIMING/LIN/LINSRC} and type \texttt{make} followed by the data types desired, as in the examples of Section~\ref{toplevelmakefile}. The library of instrumented code is created in -\texttt{LAPACK/TIMING/LIN/linsrc\_PLAT.a}, -where \texttt{PLAT} is the user-defined architecture suffix specified in the -file \texttt{LAPACK/make.inc}. 
+\texttt{LAPACK/TIMING/LIN/linsrc.a}. \end{sloppypar} \item[b)] @@ -1251,9 +1249,7 @@ To make a library of the instrumented LAPACK routines, first go to \texttt{LAPACK/TIMING/EIG/EIGSRC} and type \texttt{make} followed by the data types desired, as in the examples of Section~\ref{toplevelmakefile}. The library of instrumented code is created in -\texttt{LAPACK/TIMING/EIG/eigsrc\_PLAT.a}, -where \texttt{PLAT} is the user-defined architecture suffix specified in the -file \texttt{LAPACK/make.inc}. +\texttt{LAPACK/TIMING/EIG/eigsrc.a}. \end{sloppypar} \item[b)] @@ -1389,7 +1385,7 @@ installing LAPACK on an SGI architecture. \section{ETIME} On HPPA architectures, -the compiler and loader flag \texttt{+U77} should be included to access +the compiler and linker flag \texttt{+U77} should be included to access the function \texttt{ETIME}. \section{ILAENV and IEEE-754 compliance} @@ -1494,13 +1490,13 @@ has two options: increase your stack size, or force all local variables to be allocated statically. On HPPA architectures, the -compiler and loader flag \texttt{-K} should be used when compiling these testing +compiler and linker flag \texttt{-K} should be used when compiling these testing and timing main programs to avoid such a stack overflow. I.e., set -\texttt{DRVOPTS = -K} in the \texttt{LAPACK/make.inc} file. +\texttt{FFLAGS\_DRV = -K} in the \texttt{LAPACK/make.inc} file. For similar reasons, -on SGI architectures, the compiler and loader flag \texttt{-static} should be -used. I.e., set \texttt{DRVOPTS = -static} in the \texttt{LAPACK/make.inc} file. +on SGI architectures, the compiler and linker flag \texttt{-static} should be +used. I.e., set \texttt{FFLAGS\_DRV = -static} in the \texttt{LAPACK/make.inc} file. \section{IEEE arithmetic} diff --git a/lapack-netlib/INSTALL/Makefile b/lapack-netlib/INSTALL/Makefile index 150a061d6..1007c1bca 100644 --- a/lapack-netlib/INSTALL/Makefile +++ b/lapack-netlib/INSTALL/Makefile @@ -1,30 +1,33 @@ -include ../make.inc +TOPSRCDIR = .. 
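[Editor's note: the stack-overflow issue behind FFLAGS_DRV = -K / -static above can be reproduced in a few lines of C; the sketch below is our illustration of the failure mode, not code from this patch.]

    /* Large local (automatic) arrays live on the stack; forcing static
     * allocation moves them out, which is what the driver flags request. */
    #include <stdio.h>

    #define N 2000

    void on_stack(void) {
        double work[N][N];          /* ~32 MB automatic array: may overflow
                                     * a default 8 MB stack, like the large
                                     * locals in the Fortran test drivers */
        work[0][0] = 1.0;
        printf("%g\n", work[0][0]);
    }

    void forced_static(void) {
        static double work[N][N];   /* what -K (HP-UX) or -static (SGI)
                                     * requests for Fortran locals */
        work[0][0] = 1.0;
        printf("%g\n", work[0][0]);
    }

    int main(void) {
        forced_static();            /* safe */
        on_stack();                 /* may crash unless the stack limit is raised */
        return 0;
    }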
+include $(TOPSRCDIR)/make.inc +.PHONY: all testlsame testslamch testdlamch testsecond testdsecnd testieee testversion all: testlsame testslamch testdlamch testsecond testdsecnd testieee testversion testlsame: lsame.o lsametst.o - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ testslamch: slamch.o lsame.o slamchtst.o - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ testdlamch: dlamch.o lsame.o dlamchtst.o - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ testsecond: second_$(TIMER).o secondtst.o @echo "[INFO] : TIMER value: $(TIMER) (given by make.inc)" - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ testdsecnd: dsecnd_$(TIMER).o dsecndtst.o @echo "[INFO] : TIMER value: $(TIMER) (given by make.inc)" - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ testieee: tstiee.o - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ testversion: ilaver.o LAPACK_version.o - $(LOADER) $(LOADOPTS) -o $@ $^ + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +.PHONY: run run: all ./testlsame ./testslamch @@ -34,6 +37,7 @@ run: all ./testieee ./testversion +.PHONY: clean cleanobj cleanexe cleantest clean: cleanobj cleanexe cleantest cleanobj: rm -f *.o @@ -42,9 +46,5 @@ cleanexe: cleantest: rm -f core -.SUFFIXES: .o .f -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< - -slamch.o: slamch.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< -dlamch.o: dlamch.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< +slamch.o: slamch.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +dlamch.o: dlamch.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< diff --git a/lapack-netlib/INSTALL/dlamch.f b/lapack-netlib/INSTALL/dlamch.f index 76f875cef..9073cd45e 100644 --- a/lapack-netlib/INSTALL/dlamch.f +++ b/lapack-netlib/INSTALL/dlamch.f @@ -10,6 +10,10 @@ * * DOUBLE PRECISION FUNCTION DLAMCH( CMACH ) * +* .. Scalar Arguments .. +* CHARACTER CMACH +* .. +* * *> \par Purpose: * ============= @@ -24,6 +28,7 @@ * *> \param[in] CMACH *> \verbatim +*> CMACH is CHARACTER*1 *> Specifies the value to be returned by DLAMCH: *> = 'E' or 'e', DLAMCH := eps *> = 'S' or 's , DLAMCH := sfmin diff --git a/lapack-netlib/INSTALL/dlamchf77.f b/lapack-netlib/INSTALL/dlamchf77.f index 3efd21535..37b30551f 100644 --- a/lapack-netlib/INSTALL/dlamchf77.f +++ b/lapack-netlib/INSTALL/dlamchf77.f @@ -10,6 +10,10 @@ * * DOUBLE PRECISION FUNCTION DLAMCH( CMACH ) * +* .. Scalar Arguments .. +* CHARACTER CMACH +* .. +* * *> \par Purpose: * ============= diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index e1d59f465..79fe597ae 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -25,12 +25,15 @@ * ========== * *> \param[out] VERS_MAJOR +*> VERS_MAJOR is INTEGER *> return the lapack major version *> *> \param[out] VERS_MINOR +*> VERS_MINOR is INTEGER *> return the lapack minor version from the major version *> *> \param[out] VERS_PATCH +*> VERS_PATCH is INTEGER *> return the lapack patch version from the minor version * * Authors: @@ -41,24 +44,23 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2017 +*> \date November 2019 * *> \ingroup auxOTHERauxiliary * * ===================================================================== SUBROUTINE ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) * -* -- LAPACK computational routine (version 3.7.1) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- -* June 2017 * * ===================================================================== * INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 8 + VERS_MINOR = 9 VERS_PATCH = 0 * ===================================================================== * diff --git a/lapack-netlib/INSTALL/make.inc.ALPHA b/lapack-netlib/INSTALL/make.inc.ALPHA index 0ceeaa155..d6397e81d 100644 --- a/lapack-netlib/INSTALL/make.inc.ALPHA +++ b/lapack-netlib/INSTALL/make.inc.ALPHA @@ -8,30 +8,28 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = cc +CC = cc CFLAGS = -O4 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = f77 -OPTS = -O4 -fpe1 -DRVOPTS = $(OPTS) -NOOPT = +FC = f77 +FFLAGS = -O4 -fpe1 +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = f77 -LOADOPTS = +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # Timer for the SECOND and DSECND routines # @@ -74,9 +72,9 @@ TIMER = EXT_ETIME # machine-specific, optimized BLAS library should be used whenever # possible.) # -#BLASLIB = ../../librefblas.a +#BLASLIB = $(TOPSRCDIR)/librefblas.a BLASLIB = -ldxml -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.HPPA b/lapack-netlib/INSTALL/make.inc.HPPA index 8eabbbdf4..6ee2b2dfb 100644 --- a/lapack-netlib/INSTALL/make.inc.HPPA +++ b/lapack-netlib/INSTALL/make.inc.HPPA @@ -8,30 +8,28 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = cc +CC = cc CFLAGS = -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = f77 -OPTS = +O4 +U77 -DRVOPTS = $(OPTS) -K -NOOPT = +U77 +FC = f77 +FFLAGS = +O4 +U77 +FFLAGS_DRV = $(FFLAGS) -K +FFLAGS_NOOPT = +U77 -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = f77 -LOADOPTS = -Aa +U77 +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # Timer for the SECOND and DSECND routines # @@ -74,9 +72,9 @@ TIMER = EXT_ETIME # machine-specific, optimized BLAS library should be used whenever # possible.) 
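[Editor's note: the ilaver.f hunk above bumps the reported version to 3.9.0. A quick way to confirm which LAPACK you actually linked is to call ILAVER from C; a minimal sketch, assuming the common lowercase-plus-trailing-underscore Fortran mangling (ilaver_), so adjust to your compiler's convention.]

    #include <stdio.h>

    /* Fortran SUBROUTINE ILAVER(VERS_MAJOR, VERS_MINOR, VERS_PATCH) */
    extern void ilaver_(int *major, int *minor, int *patch);

    int main(void) {
        int major, minor, patch;
        ilaver_(&major, &minor, &patch);
        printf("linked LAPACK %d.%d.%d\n", major, minor, patch); /* 3.9.0 after this patch */
        return 0;
    }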
# -#BLASLIB = ../../librefblas.a +#BLASLIB = $(TOPSRCDIR)/librefblas.a BLASLIB = -lblas -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.IRIX64 b/lapack-netlib/INSTALL/make.inc.IRIX64 index d9e71e1bf..59fe522eb 100644 --- a/lapack-netlib/INSTALL/make.inc.IRIX64 +++ b/lapack-netlib/INSTALL/make.inc.IRIX64 @@ -8,33 +8,30 @@ SHELL = /sbin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = cc +CC = cc CFLAGS = -O3 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = f77 -OPTS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON -#OPTS = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON -DRVOPTS = $(OPTS) -static -NOOPT = -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON -#NOOPT = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON +FC = f77 +FFLAGS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON +#FFLAGS = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON +FFLAGS_DRV = $(FFLAGS) -static +FFLAGS_NOOPT = -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON +#FFLAGS_NOOPT = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = f77 -LOADOPTS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON -#LOADOPTS = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # Timer for the SECOND and DSECND routines # @@ -78,8 +75,8 @@ TIMER = EXT_ETIME # possible.) # #BLASLIB = -lblas -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.O2K b/lapack-netlib/INSTALL/make.inc.O2K index 3ffcadacc..3c3dbc800 100644 --- a/lapack-netlib/INSTALL/make.inc.O2K +++ b/lapack-netlib/INSTALL/make.inc.O2K @@ -8,33 +8,30 @@ SHELL = /sbin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = cc +CC = cc CFLAGS = -O3 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = f77 -OPTS = -O3 -64 -mips4 -r10000 -#OPTS = -O3 -64 -mips4 -r10000 -mp -DRVOPTS = $(OPTS) -static -NOOPT = -64 -mips4 -r10000 -#NOOPT = -64 -mips4 -r10000 -mp +FC = f77 +FFLAGS = -O3 -64 -mips4 -r10000 +#FFLAGS = -O3 -64 -mips4 -r10000 -mp +FFLAGS_DRV = $(FFLAGS) -static +FFLAGS_NOOPT = -64 -mips4 -r10000 +#FFLAGS_NOOPT = -64 -mips4 -r10000 -mp -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. 
+# Define LDFLAGS to the desired linker options for your machine. # -LOADER = f77 -LOADOPTS = -O3 -64 -mips4 -r10000 -#LOADOPTS = -O3 -64 -mips4 -r10000 -mp +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # Timer for the SECOND and DSECND routines # @@ -79,8 +76,8 @@ TIMER = EXT_ETIME # BLASLIB = -lblas #BLASLIB = -lblas_mp -#BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +#BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.SGI5 b/lapack-netlib/INSTALL/make.inc.SGI5 index c7019ac16..1013cffdb 100644 --- a/lapack-netlib/INSTALL/make.inc.SGI5 +++ b/lapack-netlib/INSTALL/make.inc.SGI5 @@ -8,30 +8,28 @@ SHELL = /sbin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = cc +CC = cc CFLAGS = -O4 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = f77 -OPTS = -O4 -DRVOPTS = $(OPTS) -static -NOOPT = +FC = f77 +FFLAGS = -O4 +FFLAGS_DRV = $(FFLAGS) -static +FFLAGS_NOOPT = -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = f77 -LOADOPTS = +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # Timer for the SECOND and DSECND routines # @@ -75,8 +73,8 @@ TIMER = EXT_ETIME # possible.) # #BLASLIB = -lblas -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.SUN4 b/lapack-netlib/INSTALL/make.inc.SUN4 index 4e44f1beb..2da0ecb65 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4 +++ b/lapack-netlib/INSTALL/make.inc.SUN4 @@ -8,30 +8,28 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = cc +CC = cc CFLAGS = -O3 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = f77 -OPTS = -dalign -O4 -fast -DRVOPTS = $(OPTS) -NOOPT = +FC = f77 +FFLAGS = -dalign -O4 -fast +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = f77 -LOADOPTS = -dalign -O4 -fast +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # Timer for the SECOND and DSECND routines # @@ -75,8 +73,8 @@ TIMER = EXT_ETIME # possible.) # #BLASLIB = -lblas -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 index e6d79add3..d2db07c61 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 +++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 @@ -8,34 +8,31 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = cc +CC = cc CFLAGS = -O3 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = f77 -#OPTS = -O4 -u -f -mt -#OPTS = -u -f -dalign -native -xO5 -xarch=v8plusa -OPTS = -u -f -dalign -native -xO2 -xarch=v8plusa -DRVOPTS = $(OPTS) -NOOPT = -u -f -#NOOPT = -u -f -mt +FC = f77 +#FFLAGS = -O4 -u -f -mt +#FFLAGS = -u -f -dalign -native -xO5 -xarch=v8plusa +FFLAGS = -u -f -dalign -native -xO2 -xarch=v8plusa +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -u -f +#FFLAGS_NOOPT = -u -f -mt -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = f77 -#LOADOPTS = -mt -LOADOPTS = -f -dalign -native -xO2 -xarch=v8plusa +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # Timer for the SECOND and DSECND routines # @@ -78,10 +75,10 @@ TIMER = EXT_ETIME # machine-specific, optimized BLAS library should be used whenever # possible.) # -#BLASLIB = ../../librefblas.a +#BLASLIB = $(TOPSRCDIR)/librefblas.a #BLASLIB = -xlic_lib=sunperf_mt BLASLIB = -xlic_lib=sunperf -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.XLF b/lapack-netlib/INSTALL/make.inc.XLF index 9466ee332..cb9d791a7 100644 --- a/lapack-netlib/INSTALL/make.inc.XLF +++ b/lapack-netlib/INSTALL/make.inc.XLF @@ -8,31 +8,29 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = xlc +CC = xlc CFLAGS = -O3 -qnosave -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = xlf -OPTS = -O3 -qfixed -qnosave +FC = xlf +FFLAGS = -O3 -qfixed -qnosave # For -O2, add -qstrict=none -DRVOPTS = $(OPTS) -NOOPT = -O0 -qfixed -qnosave +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -O0 -qfixed -qnosave -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. 
# -LOADER = xlf -LOADOPTS = -qnosave +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # Timer for the SECOND and DSECND routines # @@ -75,9 +73,9 @@ TIMER = EXT_ETIME_ # machine-specific, optimized BLAS library should be used whenever # possible.) # -#BLASLIB = ../../librefblas.a +#BLASLIB = $(TOPSRCDIR)/librefblas.a BLASLIB = -lessl -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.gfortran b/lapack-netlib/INSTALL/make.inc.gfortran index 39d98d4d4..104632747 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran +++ b/lapack-netlib/INSTALL/make.inc.gfortran @@ -8,10 +8,10 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = gcc +CC = gcc CFLAGS = -O3 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # @@ -19,23 +19,21 @@ CFLAGS = -O3 # and handle these quantities appropriately. As a consequence, one # should not compile LAPACK with flags such as -ffpe-trap=overflow. # -FORTRAN = gfortran -OPTS = -O2 -frecursive -DRVOPTS = $(OPTS) -NOOPT = -O0 -frecursive +FC = gfortran +FFLAGS = -O2 -frecursive +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -O0 -frecursive -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = gfortran -LOADOPTS = +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # Timer for the SECOND and DSECND routines # @@ -78,8 +76,8 @@ TIMER = INT_ETIME # machine-specific, optimized BLAS library should be used whenever # possible.) # -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.gfortran_debug b/lapack-netlib/INSTALL/make.inc.gfortran_debug index 10e6381df..246060827 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran_debug +++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug @@ -8,10 +8,10 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = gcc +CC = gcc CFLAGS = -g -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # @@ -19,23 +19,21 @@ CFLAGS = -g # and handle these quantities appropriately. As a consequence, one # should not compile LAPACK with flags such as -ffpe-trap=overflow. 
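[Editor's note: the comment above is worth taking literally. LAPACK deliberately generates infinities and NaNs and inspects the results, so trapping floating-point exceptions aborts valid runs. A small C illustration of the same probe pattern, ours rather than LAPACK source:]

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        double big = 1e308;
        double inf = big * 10.0;    /* overflow must quietly yield +Inf */
        printf("overflow -> %g, isinf = %d\n", inf, isinf(inf));
        /* Built with -ffpe-trap=overflow (gfortran) or a C equivalent,
         * the multiplication above raises SIGFPE instead of returning
         * Inf, and probes like this abort the whole run. */
        return 0;
    }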
# -FORTRAN = gfortran -fimplicit-none -g -frecursive -OPTS = -DRVOPTS = $(OPTS) -NOOPT = -g -O0 -frecursive +FC = gfortran +FFLAGS = -fimplicit-none -g -frecursive +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = $(FFLAGS) -O0 -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = gfortran -g -LOADOPTS = +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # Timer for the SECOND and DSECND routines # @@ -78,8 +76,8 @@ TIMER = INT_CPU_TIME # machine-specific, optimized BLAS library should be used whenever # possible.) # -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.ifort b/lapack-netlib/INSTALL/make.inc.ifort index b067bd484..801b46aa5 100644 --- a/lapack-netlib/INSTALL/make.inc.ifort +++ b/lapack-netlib/INSTALL/make.inc.ifort @@ -8,30 +8,28 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = icc +CC = icc CFLAGS = -O3 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = ifort -OPTS = -O3 -fp-model strict -assume protect_parens -DRVOPTS = $(OPTS) -NOOPT = -O0 -fp-model strict -assume protect_parens +FC = ifort +FFLAGS = -O3 -fp-model strict -assume protect_parens +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -O0 -fp-model strict -assume protect_parens -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = ifort -LOADOPTS = +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # Timer for the SECOND and DSECND routines # @@ -74,8 +72,8 @@ TIMER = EXT_ETIME # machine-specific, optimized BLAS library should be used whenever # possible.) # -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.pgf95 b/lapack-netlib/INSTALL/make.inc.pgf95 index a9a5cec98..87b691cdd 100644 --- a/lapack-netlib/INSTALL/make.inc.pgf95 +++ b/lapack-netlib/INSTALL/make.inc.pgf95 @@ -8,30 +8,28 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = pgcc +CC = pgcc CFLAGS = -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. 
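[Editor's note: FFLAGS_NOOPT exists because slamch.f/dlamch.f compute machine parameters by probing arithmetic at run time, and optimization can constant-fold the probes or hold them in wider registers. A C sketch of the same epsilon probe; volatile stands in for building without optimization, and this is illustrative only.]

    #include <stdio.h>

    int main(void) {
        volatile double eps = 1.0;
        volatile double one = 1.0;
        while (one + eps / 2.0 > one)   /* stop once eps/2 no longer registers */
            eps /= 2.0;
        printf("machine epsilon ~ %g\n", eps);  /* ~2.22e-16 for IEEE double */
        return 0;
    }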
# -FORTRAN = pgf95 -OPTS = -O3 -DRVOPTS = $(OPTS) -NOOPT = -O0 +FC = pgf95 +FFLAGS = -O3 +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -O0 -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = $(FORTRAN) -LOADOPTS = +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # Timer for the SECOND and DSECND routines # @@ -74,8 +72,8 @@ TIMER = INT_CPU_TIME # machine-specific, optimized BLAS library should be used whenever # possible.) # -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/make.inc.pghpf b/lapack-netlib/INSTALL/make.inc.pghpf index 1d9bf549c..97d22d27d 100644 --- a/lapack-netlib/INSTALL/make.inc.pghpf +++ b/lapack-netlib/INSTALL/make.inc.pghpf @@ -8,30 +8,28 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = pghpc +CC = pghpc CFLAGS = -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # -FORTRAN = pghpf -OPTS = -O4 -Mnohpfc -Mdclchk -DRVOPTS = $(OPTS) -NOOPT = -Mnohpfc -Mdclchk +FC = pghpf +FFLAGS = -O4 -Mnohpfc -Mdclchk +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -Mnohpfc -Mdclchk -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. # -LOADER = pghpf -LOADOPTS = +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = echo +AR = ar +ARFLAGS = cr +RANLIB = echo # Timer for the SECOND and DSECND routines # @@ -75,8 +73,8 @@ TIMER = EXT_ETIME # possible.) 
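[Editor's note: whichever line you uncomment for BLASLIB (reference librefblas.a, -lessl, -lblas, ...), a two-by-two dgemm call is a cheap smoke test that the linked library works. A sketch assuming the usual dgemm_ mangling and 32-bit Fortran INTEGERs; not part of the patch.]

    #include <stdio.h>

    extern void dgemm_(const char *transa, const char *transb,
                       const int *m, const int *n, const int *k,
                       const double *alpha, const double *a, const int *lda,
                       const double *b, const int *ldb,
                       const double *beta, double *c, const int *ldc);

    int main(void) {
        int n = 2;
        double alpha = 1.0, beta = 0.0;
        double a[4] = {1, 0, 0, 1};   /* identity, column-major */
        double b[4] = {1, 2, 3, 4};
        double c[4] = {0, 0, 0, 0};
        dgemm_("N", "N", &n, &n, &n, &alpha, a, &n, b, &n, &beta, c, &n);
        printf("c = [%g %g; %g %g]\n", c[0], c[2], c[1], c[3]);  /* must equal b */
        return 0;
    }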
# #BLASLIB = -lessl -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/INSTALL/slamch.f b/lapack-netlib/INSTALL/slamch.f index 3282fa6a3..342f446ff 100644 --- a/lapack-netlib/INSTALL/slamch.f +++ b/lapack-netlib/INSTALL/slamch.f @@ -28,6 +28,7 @@ * *> \param[in] CMACH *> \verbatim +*> CMACH is CHARACTER*1 *> Specifies the value to be returned by SLAMCH: *> = 'E' or 'e', SLAMCH := eps *> = 'S' or 's , SLAMCH := sfmin diff --git a/lapack-netlib/LAPACKE/CMakeLists.txt b/lapack-netlib/LAPACKE/CMakeLists.txt index 42faef5dd..0589a74ba 100644 --- a/lapack-netlib/LAPACKE/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/CMakeLists.txt @@ -16,18 +16,16 @@ if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND) ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h) endif() -if(WIN32 AND NOT UNIX) - add_definitions(-DHAVE_LAPACK_CONFIG_H -DLAPACK_COMPLEX_STRUCTURE) - message(STATUS "Windows BUILD") -endif() - -get_directory_property(DirDefs COMPILE_DEFINITIONS) - include_directories(include ${LAPACK_BINARY_DIR}/include) add_subdirectory(include) add_subdirectory(src) add_subdirectory(utils) +option(LAPACKE_BUILD_SINGLE "Build LAPACKE single precision real" ON) +option(LAPACKE_BUILD_DOUBLE "Build LAPACKE double precision real" ON) +option(LAPACKE_BUILD_COMPLEX "Build LAPACKE single precision complex" ON) +option(LAPACKE_BUILD_COMPLEX16 "Build LAPACKE double precision complex" ON) + macro(append_subdir_files variable dirname) get_directory_property(holder DIRECTORY ${dirname} DEFINITION ${variable}) foreach(depfile ${holder}) @@ -35,8 +33,29 @@ macro(append_subdir_files variable dirname) endforeach() endmacro() +message(STATUS "Build LAPACKE single precision real: ${LAPACKE_BUILD_SINGLE}") +message(STATUS "Build LAPACKE double precision real: ${LAPACKE_BUILD_DOUBLE}") +message(STATUS "Build LAPACKE single precision complex: ${LAPACKE_BUILD_COMPLEX}") +message(STATUS "Build LAPACKE double precision complex: ${LAPACKE_BUILD_COMPLEX16}") + append_subdir_files(LAPACKE_INCLUDE "include") append_subdir_files(SOURCES "src") +if (LAPACKE_BUILD_SINGLE) + append_subdir_files(SOURCES_SINGLE "src") + list(APPEND SOURCES ${SOURCES_SINGLE}) +endif() +if (LAPACKE_BUILD_DOUBLE) + append_subdir_files(SOURCES_DOUBLE "src") + list(APPEND SOURCES ${SOURCES_DOUBLE}) +endif() +if (LAPACKE_BUILD_COMPLEX) + append_subdir_files(SOURCES_COMPLEX "src") + list(APPEND SOURCES ${SOURCES_COMPLEX}) +endif() +if (LAPACKE_BUILD_COMPLEX16) + append_subdir_files(SOURCES_COMPLEX16 "src") + list(APPEND SOURCES ${SOURCES_COMPLEX16}) +endif() append_subdir_files(DEPRECATED "src") append_subdir_files(EXTENDED "src") append_subdir_files(MATGEN "src") @@ -61,9 +80,13 @@ set_target_properties( SOVERSION ${LAPACK_MAJOR_VERSION} ) target_include_directories(lapacke PUBLIC - $ + $ $ ) +if(WIN32 AND NOT UNIX) + target_compile_definitions(lapacke PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE) + message(STATUS "Windows BUILD") +endif() if(LAPACKE_WITH_TMG) target_link_libraries(lapacke PRIVATE tmglib) @@ -71,7 +94,11 @@ endif() target_link_libraries(lapacke PRIVATE ${LAPACK_LIBRARIES}) lapack_install_library(lapacke) -install(FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) 
+install( + FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + COMPONENT Development + ) if(BUILD_TESTING) add_subdirectory(example) @@ -82,6 +109,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc DESTINATION ${PKG_CONFIG_DIR} + COMPONENT Development ) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-version.cmake.in @@ -95,7 +123,10 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake ${LAPACK_BINARY_DIR}/lapacke-config-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION} + COMPONENT Development ) install(EXPORT lapacke-targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION}) + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION} + COMPONENT Development + ) diff --git a/lapack-netlib/LAPACKE/Makefile b/lapack-netlib/LAPACKE/Makefile index 016f8a2f2..a358d7c9f 100644 --- a/lapack-netlib/LAPACKE/Makefile +++ b/lapack-netlib/LAPACKE/Makefile @@ -40,22 +40,26 @@ # To clean everything including lapacke library type # 'make cleanall' # -include ../make.inc +TOPSRCDIR = .. +include $(TOPSRCDIR)/make.inc +.PHONY: all all: lapacke +.PHONY: lapacke lapacke: include/lapacke_mangling.h $(MAKE) -C src $(MAKE) -C utils include/lapacke_mangling.h: include/lapacke_mangling_with_flags.h.in - cp $< $@ + cp include/lapacke_mangling_with_flags.h.in $@ +.PHONY: lapacke_example lapacke_example: lapacke $(MAKE) -C example -#clean: cleanlib -clean: cleanobj +.PHONY: clean cleanobj cleanlib cleanexe +clean: $(MAKE) -C src clean $(MAKE) -C utils clean $(MAKE) -C example clean @@ -64,6 +68,6 @@ cleanobj: $(MAKE) -C utils cleanobj $(MAKE) -C example cleanobj cleanlib: - rm -f ../$(LAPACKELIB) + $(MAKE) -C src cleanlib cleanexe: $(MAKE) -C example cleanexe diff --git a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in index 6900f4533..0a1350172 100644 --- a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in +++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in @@ -7,8 +7,11 @@ if(NOT TARGET lapacke) include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") endif() +# Hint for project building against lapack +set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID}) + # Report lapacke header search locations from build tree. set(LAPACKE_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include") # Report lapacke libraries. -set(LAPACKE_LIBRARIES lapacke) +set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES}) diff --git a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in index caa459a24..57a5c2b2f 100644 --- a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in +++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in @@ -13,11 +13,14 @@ if(NOT TARGET lapacke) include(${_LAPACKE_SELF_DIR}/lapacke-targets.cmake) endif() +# Hint for project building against lapack +set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID}) + # Report lapacke header search locations. set(LAPACKE_INCLUDE_DIRS ${_LAPACKE_PREFIX}/include) # Report lapacke libraries. 
-set(LAPACKE_LIBRARIES lapacke)
+set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES})
 
 unset(_LAPACKE_PREFIX)
 unset(_LAPACKE_SELF_DIR)
diff --git a/lapack-netlib/LAPACKE/example/Makefile b/lapack-netlib/LAPACKE/example/Makefile
index f959a2be0..77526dc42 100644
--- a/lapack-netlib/LAPACKE/example/Makefile
+++ b/lapack-netlib/LAPACKE/example/Makefile
@@ -1,34 +1,38 @@
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
 
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I. -I../include -c -o $@ $<
+
+.PHONY: all
 all: xexample_DGESV_rowmajor \
 	xexample_DGESV_colmajor \
 	xexample_DGELS_rowmajor \
 	xexample_DGELS_colmajor
 
-LIBRARIES = ../../$(LAPACKELIB) ../../$(LAPACKLIB) $(BLASLIB)
+LIBRARIES = $(LAPACKELIB) $(LAPACKLIB) $(BLASLIB)
 
 # Double Precision Examples
 xexample_DGESV_rowmajor: example_DGESV_rowmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@
 
 xexample_DGESV_colmajor: example_DGESV_colmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@
 
 xexample_DGELS_rowmajor: example_DGELS_rowmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@
 
 xexample_DGELS_colmajor: example_DGELS_colmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@
 
+.PHONY: clean cleanobj cleanexe
 clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
 cleanexe:
 	rm -f x*
-
-.c.o:
-	$(CC) $(CFLAGS) -I. -I../include -c -o $@ $<
diff --git a/lapack-netlib/LAPACKE/include/CMakeLists.txt b/lapack-netlib/LAPACKE/include/CMakeLists.txt
index 4c30c0501..b690dc554 100644
--- a/lapack-netlib/LAPACKE/include/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/include/CMakeLists.txt
@@ -1,3 +1,3 @@
-set(LAPACKE_INCLUDE lapacke.h lapacke_config.h lapacke_utils.h)
+set(LAPACKE_INCLUDE lapacke.h lapack.h lapacke_config.h lapacke_utils.h)
 
 file(COPY ${LAPACKE_INCLUDE} DESTINATION ${LAPACK_BINARY_DIR}/include)
diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
new file mode 100644
index 000000000..0a6226fe4
--- /dev/null
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -0,0 +1,13715 @@
+#ifndef LAPACK_H
+#define LAPACK_H
+
+/*
+* Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
+*/
+#ifdef HAVE_LAPACK_CONFIG_H
+#include "lapacke_config.h"
+#endif
+
+#include "lapacke_mangling.h"
+
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*----------------------------------------------------------------------------*/
+#ifndef lapack_int
+#define lapack_int int
+#endif
+
+#ifndef lapack_logical
+#define lapack_logical lapack_int
+#endif
+
+/* f2c, hence clapack and MacOS Accelerate, returns double instead of float
+ * for sdot, slange, clange, etc. */
+#if defined(LAPACK_F2C)
+    typedef double lapack_float_return;
+#else
+    typedef float lapack_float_return;
+#endif
+
+/* Complex types are structures equivalent to the
+* Fortran complex types COMPLEX(4) and COMPLEX(8).
+*
+* One can also redefine the types with one's own types
+* for example by including in the code definitions like
+*
+* #define lapack_complex_float std::complex<float>
+* #define lapack_complex_double std::complex<double>
+*
+* or define these types in the command line:
+*
+* -Dlapack_complex_float="std::complex<float>"
+* -Dlapack_complex_double="std::complex<double>"
+*/
+
+#ifndef LAPACK_COMPLEX_CUSTOM
+
+/* Complex type (single precision) */
+#ifndef lapack_complex_float
+#ifndef __cplusplus
+#include <complex.h>
+#else
+#include <complex>
+#endif
+#define lapack_complex_float float _Complex
+#endif
+
+#ifndef lapack_complex_float_real
+#define lapack_complex_float_real(z) (creal(z))
+#endif
+
+#ifndef lapack_complex_float_imag
+#define lapack_complex_float_imag(z) (cimag(z))
+#endif
+
+/* Complex type (double precision) */
+#ifndef lapack_complex_double
+#ifndef __cplusplus
+#include <complex.h>
+#else
+#include <complex>
+#endif
+#define lapack_complex_double double _Complex
+#endif
+
+#ifndef lapack_complex_double_real
+#define lapack_complex_double_real(z) (creal(z))
+#endif
+
+#ifndef lapack_complex_double_imag
+#define lapack_complex_double_imag(z) (cimag(z))
+#endif
+
+#endif /* LAPACK_COMPLEX_CUSTOM */
+
+/* Callback logical functions of one, two, or three arguments are used
+* to select eigenvalues to sort to the top left of the Schur form.
+* The value is selected if function returns TRUE (non-zero). */
+
+typedef lapack_logical (*LAPACK_S_SELECT2) ( const float*, const float* );
+typedef lapack_logical (*LAPACK_S_SELECT3)
+    ( const float*, const float*, const float* );
+typedef lapack_logical (*LAPACK_D_SELECT2) ( const double*, const double* );
+typedef lapack_logical (*LAPACK_D_SELECT3)
+    ( const double*, const double*, const double* );
+
+typedef lapack_logical (*LAPACK_C_SELECT1) ( const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_C_SELECT2)
+    ( const lapack_complex_float*, const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_Z_SELECT1) ( const lapack_complex_double* );
+typedef lapack_logical (*LAPACK_Z_SELECT2)
+    ( const lapack_complex_double*, const lapack_complex_double* );
+
+#define LAPACK_lsame LAPACK_GLOBAL(lsame,LSAME)
+lapack_logical LAPACK_lsame( char* ca, char* cb,
+    lapack_int lca, lapack_int lcb );
+
+
+/*----------------------------------------------------------------------------*/
+/* This is in alphabetical order (ignoring leading precision).
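[Editor's note: a minimal sketch of what the definitions above give a C caller: lapack_complex_float is C99 float _Complex unless overridden, and the _real/_imag macros work on it. This example is ours, not part of lapack.h; with the -D overrides shown above, C++ code would use std::complex<float> instead.]

    #include <complex.h>
    #include <stdio.h>

    #define lapack_complex_float float _Complex          // as lapack.h defines it
    #define lapack_complex_float_real(z) (creal(z))
    #define lapack_complex_float_imag(z) (cimag(z))

    int main(void) {
        lapack_complex_float z = 1.0f + 2.0f * I;        // a COMPLEX argument as C sees it
        printf("z = (%g, %g)\n",
               lapack_complex_float_real(z),
               lapack_complex_float_imag(z));
        return 0;
    }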
*/ + +#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd,CBBCSD) +void LAPACK_cbbcsd( + char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + float* theta, + float* phi, + lapack_complex_float* U1, lapack_int const* ldu1, + lapack_complex_float* U2, lapack_int const* ldu2, + lapack_complex_float* V1T, lapack_int const* ldv1t, + lapack_complex_float* V2T, lapack_int const* ldv2t, + float* B11D, + float* B11E, + float* B12D, + float* B12E, + float* B21D, + float* B21E, + float* B22D, + float* B22E, + float* rwork, lapack_int const* lrwork, + lapack_int* info ); + +#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd,DBBCSD) +void LAPACK_dbbcsd( + char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + double* theta, + double* phi, + double* U1, lapack_int const* ldu1, + double* U2, lapack_int const* ldu2, + double* V1T, lapack_int const* ldv1t, + double* V2T, lapack_int const* ldv2t, + double* B11D, + double* B11E, + double* B12D, + double* B12E, + double* b21d, + double* b21e, + double* b22d, + double* b22e, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd,SBBCSD) +void LAPACK_sbbcsd( + char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + float* theta, + float* phi, + float* U1, lapack_int const* ldu1, + float* U2, lapack_int const* ldu2, + float* V1T, lapack_int const* ldv1t, + float* V2T, lapack_int const* ldv2t, + float* B11D, + float* B11E, + float* B12D, + float* B12E, + float* B21D, + float* B21E, + float* B22D, + float* B22E, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd,ZBBCSD) +void LAPACK_zbbcsd( + char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + double* theta, + double* phi, + lapack_complex_double* U1, lapack_int const* ldu1, + lapack_complex_double* U2, lapack_int const* ldu2, + lapack_complex_double* V1T, lapack_int const* ldv1t, + lapack_complex_double* V2T, lapack_int const* ldv2t, + double* B11D, + double* B11E, + double* B12D, + double* B12E, + double* B21D, + double* B21E, + double* B22D, + double* B22E, + double* rwork, lapack_int const* lrwork, + lapack_int* info ); + +#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc,DBDSDC) +void LAPACK_dbdsdc( + char const* uplo, char const* compq, + lapack_int const* n, + double* D, + double* E, + double* U, lapack_int const* ldu, + double* VT, lapack_int const* ldvt, + double* Q, lapack_int* IQ, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc,SBDSDC) +void LAPACK_sbdsdc( + char const* uplo, char const* compq, + lapack_int const* n, + float* D, + float* E, + float* U, lapack_int const* ldu, + float* VT, lapack_int const* ldvt, + float* Q, lapack_int* IQ, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr,CBDSQR) +void LAPACK_cbdsqr( + char const* uplo, + lapack_int const* n, lapack_int const* ncvt, lapack_int const* nru, lapack_int const* ncc, + float* D, + float* E, + lapack_complex_float* VT, lapack_int const* ldvt, + lapack_complex_float* U, lapack_int const* ldu, + lapack_complex_float* C, 
lapack_int const* ldc, + float* rwork, + lapack_int* info ); + +#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr,DBDSQR) +void LAPACK_dbdsqr( + char const* uplo, + lapack_int const* n, lapack_int const* ncvt, lapack_int const* nru, lapack_int const* ncc, + double* D, + double* E, + double* VT, lapack_int const* ldvt, + double* U, lapack_int const* ldu, + double* C, lapack_int const* ldc, + double* work, + lapack_int* info ); + +#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr,SBDSQR) +void LAPACK_sbdsqr( + char const* uplo, + lapack_int const* n, lapack_int const* ncvt, lapack_int const* nru, lapack_int const* ncc, + float* D, + float* E, + float* VT, lapack_int const* ldvt, + float* U, lapack_int const* ldu, + float* C, lapack_int const* ldc, + float* work, + lapack_int* info ); + +#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr,ZBDSQR) +void LAPACK_zbdsqr( + char const* uplo, + lapack_int const* n, lapack_int const* ncvt, lapack_int const* nru, lapack_int const* ncc, + double* D, + double* E, + lapack_complex_double* VT, lapack_int const* ldvt, + lapack_complex_double* U, lapack_int const* ldu, + lapack_complex_double* C, lapack_int const* ldc, + double* rwork, + lapack_int* info ); + +#define LAPACK_dbdsvdx LAPACK_GLOBAL(dbdsvdx,DBDSVDX) +void LAPACK_dbdsvdx( + char const* uplo, char const* jobz, char const* range, + lapack_int const* n, + double const* D, + double const* E, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns, + double* S, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sbdsvdx LAPACK_GLOBAL(sbdsvdx,SBDSVDX) +void LAPACK_sbdsvdx( + char const* uplo, char const* jobz, char const* range, + lapack_int const* n, + float const* D, + float const* E, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns, + float* S, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ddisna LAPACK_GLOBAL(ddisna,DDISNA) +void LAPACK_ddisna( + char const* job, + lapack_int const* m, lapack_int const* n, + double const* D, + double* SEP, + lapack_int* info ); + +#define LAPACK_sdisna LAPACK_GLOBAL(sdisna,SDISNA) +void LAPACK_sdisna( + char const* job, + lapack_int const* m, lapack_int const* n, + float const* D, + float* SEP, + lapack_int* info ); + +#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd,CGBBRD) +void LAPACK_cgbbrd( + char const* vect, + lapack_int const* m, lapack_int const* n, lapack_int const* ncc, lapack_int const* kl, lapack_int const* ku, + lapack_complex_float* AB, lapack_int const* ldab, + float* D, + float* E, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* PT, lapack_int const* ldpt, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd,DGBBRD) +void LAPACK_dgbbrd( + char const* vect, + lapack_int const* m, lapack_int const* n, lapack_int const* ncc, lapack_int const* kl, lapack_int const* ku, + double* AB, lapack_int const* ldab, + double* D, + double* E, + double* Q, lapack_int const* ldq, + double* PT, lapack_int const* ldpt, + double* C, lapack_int const* ldc, + double* work, + lapack_int* info ); + +#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd,SGBBRD) +void LAPACK_sgbbrd( + char const* vect, + lapack_int const* m, lapack_int const* n, lapack_int const* ncc, lapack_int const* kl, lapack_int const* ku, + float* AB, lapack_int const* ldab, + float* D, + 
float* E, + float* Q, lapack_int const* ldq, + float* PT, lapack_int const* ldpt, + float* C, lapack_int const* ldc, + float* work, + lapack_int* info ); + +#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd,ZGBBRD) +void LAPACK_zgbbrd( + char const* vect, + lapack_int const* m, lapack_int const* n, lapack_int const* ncc, lapack_int const* kl, lapack_int const* ku, + lapack_complex_double* AB, lapack_int const* ldab, + double* D, + double* E, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* PT, lapack_int const* ldpt, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon,CGBCON) +void LAPACK_cgbcon( + char const* norm, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_float const* AB, lapack_int const* ldab, lapack_int const* ipiv, + float const* anorm, + float* rcond, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon,DGBCON) +void LAPACK_dgbcon( + char const* norm, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + double const* AB, lapack_int const* ldab, lapack_int const* ipiv, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon,SGBCON) +void LAPACK_sgbcon( + char const* norm, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + float const* AB, lapack_int const* ldab, lapack_int const* ipiv, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon,ZGBCON) +void LAPACK_zgbcon( + char const* norm, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_double const* AB, lapack_int const* ldab, lapack_int const* ipiv, + double const* anorm, + double* rcond, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ,CGBEQU) +void LAPACK_cgbequ( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_float const* AB, lapack_int const* ldab, + float* R, + float* C, + float* rowcnd, + float* colcnd, + float* amax, + lapack_int* info ); + +#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ,DGBEQU) +void LAPACK_dgbequ( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + double const* AB, lapack_int const* ldab, + double* R, + double* C, + double* rowcnd, + double* colcnd, + double* amax, + lapack_int* info ); + +#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ,SGBEQU) +void LAPACK_sgbequ( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + float const* AB, lapack_int const* ldab, + float* R, + float* C, + float* rowcnd, + float* colcnd, + float* amax, + lapack_int* info ); + +#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ,ZGBEQU) +void LAPACK_zgbequ( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_double const* AB, lapack_int const* ldab, + double* R, + double* C, + double* rowcnd, + double* colcnd, + double* amax, + lapack_int* info ); + +#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb,CGBEQUB) +void LAPACK_cgbequb( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_float const* AB, lapack_int const* ldab, + float* R, + float* C, + float* rowcnd, + float* colcnd, + 
float* amax, + lapack_int* info ); + +#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb,DGBEQUB) +void LAPACK_dgbequb( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + double const* AB, lapack_int const* ldab, + double* R, + double* C, + double* rowcnd, + double* colcnd, + double* amax, + lapack_int* info ); + +#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb,SGBEQUB) +void LAPACK_sgbequb( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + float const* AB, lapack_int const* ldab, + float* R, + float* C, + float* rowcnd, + float* colcnd, + float* amax, + lapack_int* info ); + +#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb,ZGBEQUB) +void LAPACK_zgbequb( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_double const* AB, lapack_int const* ldab, + double* R, + double* C, + double* rowcnd, + double* colcnd, + double* amax, + lapack_int* info ); + +#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs,CGBRFS) +void LAPACK_cgbrfs( + char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_float const* AB, lapack_int const* ldab, + lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs,DGBRFS) +void LAPACK_dgbrfs( + char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + double const* AB, lapack_int const* ldab, + double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs,SGBRFS) +void LAPACK_sgbrfs( + char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + float const* AB, lapack_int const* ldab, + float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs,ZGBRFS) +void LAPACK_zgbrfs( + char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_double const* AB, lapack_int const* ldab, + lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx,CGBRFSX) +void LAPACK_cgbrfsx( + char const* trans, char const* equed, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_float const* AB, lapack_int const* ldab, + lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, + float* R, + float* C, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + 
float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx,DGBRFSX) +void LAPACK_dgbrfsx( + char const* trans, char const* equed, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + double const* AB, lapack_int const* ldab, + double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, + double* R, + double* C, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx,SGBRFSX) +void LAPACK_sgbrfsx( + char const* trans, char const* equed, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + float const* AB, lapack_int const* ldab, + float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, + float* R, + float* C, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx,ZGBRFSX) +void LAPACK_zgbrfsx( + char const* trans, char const* equed, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_double const* AB, lapack_int const* ldab, + lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, + double* R, + double* C, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv,CGBSV) +void LAPACK_cgbsv( + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_float* AB, lapack_int const* ldab, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv,DGBSV) +void LAPACK_dgbsv( + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + double* AB, lapack_int const* ldab, lapack_int* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv,SGBSV) +void LAPACK_sgbsv( + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + float* AB, lapack_int const* ldab, lapack_int* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv,ZGBSV) +void LAPACK_zgbsv( + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_double* AB, lapack_int const* ldab, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx,CGBSVX) +void LAPACK_cgbsvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_float* AB, lapack_int const* ldab, + 
lapack_complex_float* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed, + float* R, + float* C, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx,DGBSVX) +void LAPACK_dgbsvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + double* AB, lapack_int const* ldab, + double* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed, + double* R, + double* C, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx,SGBSVX) +void LAPACK_sgbsvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + float* AB, lapack_int const* ldab, + float* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed, + float* R, + float* C, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx,ZGBSVX) +void LAPACK_zgbsvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed, + double* R, + double* C, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx,CGBSVXX) +void LAPACK_cgbsvxx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_complex_float* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed, + float* R, + float* C, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx,DGBSVXX) +void LAPACK_dgbsvxx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + double* AB, lapack_int const* ldab, + double* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed, + double* R, + double* C, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx,SGBSVXX) +void LAPACK_sgbsvxx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + float* AB, lapack_int const* 
ldab, + float* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed, + float* R, + float* C, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx,ZGBSVXX) +void LAPACK_zgbsvxx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed, + double* R, + double* C, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf,CGBTRF) +void LAPACK_cgbtrf( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_float* AB, lapack_int const* ldab, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf,DGBTRF) +void LAPACK_dgbtrf( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + double* AB, lapack_int const* ldab, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf,SGBTRF) +void LAPACK_sgbtrf( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + float* AB, lapack_int const* ldab, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf,ZGBTRF) +void LAPACK_zgbtrf( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_double* AB, lapack_int const* ldab, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs,CGBTRS) +void LAPACK_cgbtrs( + char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_float const* AB, lapack_int const* ldab, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs,DGBTRS) +void LAPACK_dgbtrs( + char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + double const* AB, lapack_int const* ldab, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs,SGBTRS) +void LAPACK_sgbtrs( + char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + float const* AB, lapack_int const* ldab, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs,ZGBTRS) +void LAPACK_zgbtrs( + char const* trans, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, + lapack_complex_double const* AB, lapack_int const* ldab, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cgebak LAPACK_GLOBAL(cgebak,CGEBAK) +void LAPACK_cgebak( + char const* job, char const* side, + lapack_int 
const* n, lapack_int const* ilo, lapack_int const* ihi, + float const* scale, lapack_int const* m, + lapack_complex_float* V, lapack_int const* ldv, + lapack_int* info ); + +#define LAPACK_dgebak LAPACK_GLOBAL(dgebak,DGEBAK) +void LAPACK_dgebak( + char const* job, char const* side, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double const* scale, lapack_int const* m, + double* V, lapack_int const* ldv, + lapack_int* info ); + +#define LAPACK_sgebak LAPACK_GLOBAL(sgebak,SGEBAK) +void LAPACK_sgebak( + char const* job, char const* side, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float const* scale, lapack_int const* m, + float* V, lapack_int const* ldv, + lapack_int* info ); + +#define LAPACK_zgebak LAPACK_GLOBAL(zgebak,ZGEBAK) +void LAPACK_zgebak( + char const* job, char const* side, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double const* scale, lapack_int const* m, + lapack_complex_double* V, lapack_int const* ldv, + lapack_int* info ); + +#define LAPACK_cgebal LAPACK_GLOBAL(cgebal,CGEBAL) +void LAPACK_cgebal( + char const* job, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ilo, lapack_int* ihi, + float* scale, + lapack_int* info ); + +#define LAPACK_dgebal LAPACK_GLOBAL(dgebal,DGEBAL) +void LAPACK_dgebal( + char const* job, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* ilo, lapack_int* ihi, + double* scale, + lapack_int* info ); + +#define LAPACK_sgebal LAPACK_GLOBAL(sgebal,SGEBAL) +void LAPACK_sgebal( + char const* job, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* ilo, lapack_int* ihi, + float* scale, + lapack_int* info ); + +#define LAPACK_zgebal LAPACK_GLOBAL(zgebal,ZGEBAL) +void LAPACK_zgebal( + char const* job, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ilo, lapack_int* ihi, + double* scale, + lapack_int* info ); + +#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd,CGEBRD) +void LAPACK_cgebrd( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float* D, + float* E, + lapack_complex_float* tauq, + lapack_complex_float* taup, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd,DGEBRD) +void LAPACK_dgebrd( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* D, + double* E, + double* tauq, + double* taup, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd,SGEBRD) +void LAPACK_sgebrd( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* D, + float* E, + float* tauq, + float* taup, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd,ZGEBRD) +void LAPACK_zgebrd( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double* D, + double* E, + lapack_complex_double* tauq, + lapack_complex_double* taup, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgecon LAPACK_GLOBAL(cgecon,CGECON) +void LAPACK_cgecon( + char const* norm, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float const* anorm, + float* rcond, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgecon LAPACK_GLOBAL(dgecon,DGECON) +void LAPACK_dgecon( + char const* norm, + 
lapack_int const* n, + double const* A, lapack_int const* lda, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgecon LAPACK_GLOBAL(sgecon,SGECON) +void LAPACK_sgecon( + char const* norm, + lapack_int const* n, + float const* A, lapack_int const* lda, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgecon LAPACK_GLOBAL(zgecon,ZGECON) +void LAPACK_zgecon( + char const* norm, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double const* anorm, + double* rcond, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ,CGEEQU) +void LAPACK_cgeequ( + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* R, + float* C, + float* rowcnd, + float* colcnd, + float* amax, + lapack_int* info ); + +#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ,DGEEQU) +void LAPACK_dgeequ( + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double* R, + double* C, + double* rowcnd, + double* colcnd, + double* amax, + lapack_int* info ); + +#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ,SGEEQU) +void LAPACK_sgeequ( + lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float* R, + float* C, + float* rowcnd, + float* colcnd, + float* amax, + lapack_int* info ); + +#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ,ZGEEQU) +void LAPACK_zgeequ( + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* R, + double* C, + double* rowcnd, + double* colcnd, + double* amax, + lapack_int* info ); + +#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb,CGEEQUB) +void LAPACK_cgeequb( + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* R, + float* C, + float* rowcnd, + float* colcnd, + float* amax, + lapack_int* info ); + +#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb,DGEEQUB) +void LAPACK_dgeequb( + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double* R, + double* C, + double* rowcnd, + double* colcnd, + double* amax, + lapack_int* info ); + +#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb,SGEEQUB) +void LAPACK_sgeequb( + lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float* R, + float* C, + float* rowcnd, + float* colcnd, + float* amax, + lapack_int* info ); + +#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb,ZGEEQUB) +void LAPACK_zgeequb( + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* R, + double* C, + double* rowcnd, + double* colcnd, + double* amax, + lapack_int* info ); + +#define LAPACK_cgees LAPACK_GLOBAL(cgees,CGEES) +void LAPACK_cgees( + char const* jobvs, char const* sort, LAPACK_C_SELECT1 select, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* sdim, + lapack_complex_float* W, + lapack_complex_float* VS, lapack_int const* ldvs, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_dgees LAPACK_GLOBAL(dgees,DGEES) +void LAPACK_dgees( + char const* jobvs, char const* sort, LAPACK_D_SELECT2 select, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* sdim, + double* WR, + double* WI, + double* VS, lapack_int const* ldvs, + double* 
work, lapack_int const* lwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_sgees LAPACK_GLOBAL(sgees,SGEES) +void LAPACK_sgees( + char const* jobvs, char const* sort, LAPACK_S_SELECT2 select, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* sdim, + float* WR, + float* WI, + float* VS, lapack_int const* ldvs, + float* work, lapack_int const* lwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_zgees LAPACK_GLOBAL(zgees,ZGEES) +void LAPACK_zgees( + char const* jobvs, char const* sort, LAPACK_Z_SELECT1 select, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* sdim, + lapack_complex_double* W, + lapack_complex_double* VS, lapack_int const* ldvs, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx,CGEESX) +void LAPACK_cgeesx( + char const* jobvs, char const* sort, LAPACK_C_SELECT1 select, char const* sense, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* sdim, + lapack_complex_float* W, + lapack_complex_float* VS, lapack_int const* ldvs, + float* rconde, + float* rcondv, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx,DGEESX) +void LAPACK_dgeesx( + char const* jobvs, char const* sort, LAPACK_D_SELECT2 select, char const* sense, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* sdim, + double* WR, + double* WI, + double* VS, lapack_int const* ldvs, + double* rconde, + double* rcondv, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx,SGEESX) +void LAPACK_sgeesx( + char const* jobvs, char const* sort, LAPACK_S_SELECT2 select, char const* sense, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* sdim, + float* WR, + float* WI, + float* VS, lapack_int const* ldvs, + float* rconde, + float* rcondv, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx,ZGEESX) +void LAPACK_zgeesx( + char const* jobvs, char const* sort, LAPACK_Z_SELECT1 select, char const* sense, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* sdim, + lapack_complex_double* W, + lapack_complex_double* VS, lapack_int const* ldvs, + double* rconde, + double* rcondv, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_cgeev LAPACK_GLOBAL(cgeev,CGEEV) +void LAPACK_cgeev( + char const* jobvl, char const* jobvr, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* W, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgeev LAPACK_GLOBAL(dgeev,DGEEV) +void LAPACK_dgeev( + char const* jobvl, char const* jobvr, + lapack_int const* n, + double* A, lapack_int const* lda, + double* WR, + double* WI, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgeev LAPACK_GLOBAL(sgeev,SGEEV) 
+void LAPACK_sgeev( + char const* jobvl, char const* jobvr, + lapack_int const* n, + float* A, lapack_int const* lda, + float* WR, + float* WI, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgeev LAPACK_GLOBAL(zgeev,ZGEEV) +void LAPACK_zgeev( + char const* jobvl, char const* jobvr, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* W, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx,CGEEVX) +void LAPACK_cgeevx( + char const* balanc, char const* jobvl, char const* jobvr, char const* sense, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* W, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi, + float* scale, + float* abnrm, + float* rconde, + float* rcondv, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx,DGEEVX) +void LAPACK_dgeevx( + char const* balanc, char const* jobvl, char const* jobvr, char const* sense, + lapack_int const* n, + double* A, lapack_int const* lda, + double* WR, + double* WI, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi, + double* scale, + double* abnrm, + double* rconde, + double* rcondv, + double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx,SGEEVX) +void LAPACK_sgeevx( + char const* balanc, char const* jobvl, char const* jobvr, char const* sense, + lapack_int const* n, + float* A, lapack_int const* lda, + float* WR, + float* WI, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi, + float* scale, + float* abnrm, + float* rconde, + float* rcondv, + float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx,ZGEEVX) +void LAPACK_zgeevx( + char const* balanc, char const* jobvl, char const* jobvr, char const* sense, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* W, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi, + double* scale, + double* abnrm, + double* rconde, + double* rcondv, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd,CGEHRD) +void LAPACK_cgehrd( + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd,DGEHRD) +void LAPACK_dgehrd( + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double* A, lapack_int const* lda, + double* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd,SGEHRD) +void LAPACK_sgehrd( + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float* A, lapack_int const* lda, + float* 
tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd,ZGEHRD) +void LAPACK_zgehrd( + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgejsv LAPACK_GLOBAL(cgejsv,CGEJSV) +void LAPACK_cgejsv( + char const* joba, char const* jobu, char const* jobv, char const* jobr, char const* jobt, char const* jobp, + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float* SVA, + lapack_complex_float* U, lapack_int const* ldu, + lapack_complex_float* V, lapack_int const* ldv, + lapack_complex_float* cwork, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv,DGEJSV) +void LAPACK_dgejsv( + char const* joba, char const* jobu, char const* jobv, char const* jobr, char const* jobt, char const* jobp, + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* SVA, + double* U, lapack_int const* ldu, + double* V, lapack_int const* ldv, + double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv,SGEJSV) +void LAPACK_sgejsv( + char const* joba, char const* jobu, char const* jobv, char const* jobr, char const* jobt, char const* jobp, + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* SVA, + float* U, lapack_int const* ldu, + float* V, lapack_int const* ldv, + float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgejsv LAPACK_GLOBAL(zgejsv,ZGEJSV) +void LAPACK_zgejsv( + char const* joba, char const* jobu, char const* jobv, char const* jobr, char const* jobt, char const* jobp, + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double* SVA, + lapack_complex_double* U, lapack_int const* ldu, + lapack_complex_double* V, lapack_int const* ldv, + lapack_complex_double* cwork, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_cgelq LAPACK_GLOBAL(cgelq,CGELQ) +void LAPACK_cgelq( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* T, lapack_int const* tsize, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgelq LAPACK_GLOBAL(dgelq,DGELQ) +void LAPACK_dgelq( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* T, lapack_int const* tsize, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgelq LAPACK_GLOBAL(sgelq,SGELQ) +void LAPACK_sgelq( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* T, lapack_int const* tsize, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgelq LAPACK_GLOBAL(zgelq,ZGELQ) +void LAPACK_zgelq( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* T, lapack_int const* tsize, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2,CGELQ2) +void LAPACK_cgelq2( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, 
lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2,DGELQ2) +void LAPACK_dgelq2( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, + lapack_int* info ); + +#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2,SGELQ2) +void LAPACK_sgelq2( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, + lapack_int* info ); + +#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2,ZGELQ2) +void LAPACK_zgelq2( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf,CGELQF) +void LAPACK_cgelqf( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf,DGELQF) +void LAPACK_dgelqf( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf,SGELQF) +void LAPACK_sgelqf( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf,ZGELQF) +void LAPACK_zgelqf( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgels LAPACK_GLOBAL(cgels,CGELS) +void LAPACK_cgels( + char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgels LAPACK_GLOBAL(dgels,DGELS) +void LAPACK_dgels( + char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgels LAPACK_GLOBAL(sgels,SGELS) +void LAPACK_sgels( + char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgels LAPACK_GLOBAL(zgels,ZGELS) +void LAPACK_zgels( + char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd,CGELSD) +void LAPACK_cgelsd( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + float* S, + float const* rcond, lapack_int* rank, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd,DGELSD) +void LAPACK_dgelsd( + lapack_int const* 
m, lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* S, + double const* rcond, lapack_int* rank, + double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd,SGELSD) +void LAPACK_sgelsd( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* S, + float const* rcond, lapack_int* rank, + float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd,ZGELSD) +void LAPACK_zgelsd( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + double* S, + double const* rcond, lapack_int* rank, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_cgelss LAPACK_GLOBAL(cgelss,CGELSS) +void LAPACK_cgelss( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + float* S, + float const* rcond, lapack_int* rank, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgelss LAPACK_GLOBAL(dgelss,DGELSS) +void LAPACK_dgelss( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* S, + double const* rcond, lapack_int* rank, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgelss LAPACK_GLOBAL(sgelss,SGELSS) +void LAPACK_sgelss( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* S, + float const* rcond, lapack_int* rank, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgelss LAPACK_GLOBAL(zgelss,ZGELSS) +void LAPACK_zgelss( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + double* S, + double const* rcond, lapack_int* rank, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy,CGELSY) +void LAPACK_cgelsy( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, lapack_int* JPVT, + float const* rcond, lapack_int* rank, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy,DGELSY) +void LAPACK_dgelsy( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, lapack_int* JPVT, + double const* rcond, lapack_int* rank, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy,SGELSY) +void LAPACK_sgelsy( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, lapack_int* JPVT, + float const* rcond, lapack_int* rank, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgelsy 
LAPACK_GLOBAL(zgelsy,ZGELSY) +void LAPACK_zgelsy( + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, lapack_int* JPVT, + double const* rcond, lapack_int* rank, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgemlq LAPACK_GLOBAL(cgemlq,CGEMLQ) +void LAPACK_cgemlq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* T, lapack_int const* tsize, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgemlq LAPACK_GLOBAL(dgemlq,DGEMLQ) +void LAPACK_dgemlq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double const* A, lapack_int const* lda, + double const* T, lapack_int const* tsize, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgemlq LAPACK_GLOBAL(sgemlq,SGEMLQ) +void LAPACK_sgemlq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float const* A, lapack_int const* lda, + float const* T, lapack_int const* tsize, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgemlq LAPACK_GLOBAL(zgemlq,ZGEMLQ) +void LAPACK_zgemlq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* T, lapack_int const* tsize, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgemqr LAPACK_GLOBAL(cgemqr,CGEMQR) +void LAPACK_cgemqr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* T, lapack_int const* tsize, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgemqr LAPACK_GLOBAL(dgemqr,DGEMQR) +void LAPACK_dgemqr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double const* A, lapack_int const* lda, + double const* T, lapack_int const* tsize, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgemqr LAPACK_GLOBAL(sgemqr,SGEMQR) +void LAPACK_sgemqr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float const* A, lapack_int const* lda, + float const* T, lapack_int const* tsize, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgemqr LAPACK_GLOBAL(zgemqr,ZGEMQR) +void LAPACK_zgemqr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* T, lapack_int const* tsize, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgemqrt 
LAPACK_GLOBAL(cgemqrt,CGEMQRT) +void LAPACK_cgemqrt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* nb, + lapack_complex_float const* V, lapack_int const* ldv, + lapack_complex_float const* T, lapack_int const* ldt, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt,DGEMQRT) +void LAPACK_dgemqrt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* nb, + double const* V, lapack_int const* ldv, + double const* T, lapack_int const* ldt, + double* C, lapack_int const* ldc, + double* work, + lapack_int* info ); + +#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt,SGEMQRT) +void LAPACK_sgemqrt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* nb, + float const* V, lapack_int const* ldv, + float const* T, lapack_int const* ldt, + float* C, lapack_int const* ldc, + float* work, + lapack_int* info ); + +#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt,ZGEMQRT) +void LAPACK_zgemqrt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* nb, + lapack_complex_double const* V, lapack_int const* ldv, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_cgeql2 LAPACK_GLOBAL(cgeql2,CGEQL2) +void LAPACK_cgeql2( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dgeql2 LAPACK_GLOBAL(dgeql2,DGEQL2) +void LAPACK_dgeql2( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, + lapack_int* info ); + +#define LAPACK_sgeql2 LAPACK_GLOBAL(sgeql2,SGEQL2) +void LAPACK_sgeql2( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, + lapack_int* info ); + +#define LAPACK_zgeql2 LAPACK_GLOBAL(zgeql2,ZGEQL2) +void LAPACK_zgeql2( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf,CGEQLF) +void LAPACK_cgeqlf( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf,DGEQLF) +void LAPACK_dgeqlf( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf,SGEQLF) +void LAPACK_sgeqlf( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf,ZGEQLF) +void LAPACK_zgeqlf( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf,SGEQPF) +void LAPACK_sgeqpf( 
lapack_int* m, lapack_int* n, float* a, lapack_int* lda, + lapack_int* jpvt, float* tau, float* work, + lapack_int *info ); + +#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf,DGEQPF) +void LAPACK_dgeqpf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, + lapack_int* jpvt, double* tau, double* work, + lapack_int *info ); + +#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf,CGEQPF) +void LAPACK_cgeqpf( lapack_int* m, lapack_int* n, lapack_complex_float* a, + lapack_int* lda, lapack_int* jpvt, + lapack_complex_float* tau, lapack_complex_float* work, + float* rwork, lapack_int *info ); + +#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf,ZGEQPF) +void LAPACK_zgeqpf( lapack_int* m, lapack_int* n, lapack_complex_double* a, + lapack_int* lda, lapack_int* jpvt, + lapack_complex_double* tau, lapack_complex_double* work, + double* rwork, lapack_int *info ); + +#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3,CGEQP3) +void LAPACK_cgeqp3( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* JPVT, + lapack_complex_float* tau, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3,DGEQP3) +void LAPACK_dgeqp3( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* JPVT, + double* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3,SGEQP3) +void LAPACK_sgeqp3( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* JPVT, + float* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3,ZGEQP3) +void LAPACK_zgeqp3( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* JPVT, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgeqr LAPACK_GLOBAL(cgeqr,CGEQR) +void LAPACK_cgeqr( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* T, lapack_int const* tsize, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgeqr LAPACK_GLOBAL(dgeqr,DGEQR) +void LAPACK_dgeqr( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* T, lapack_int const* tsize, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgeqr LAPACK_GLOBAL(sgeqr,SGEQR) +void LAPACK_sgeqr( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* T, lapack_int const* tsize, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgeqr LAPACK_GLOBAL(zgeqr,ZGEQR) +void LAPACK_zgeqr( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* T, lapack_int const* tsize, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2,CGEQR2) +void LAPACK_cgeqr2( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2,DGEQR2) +void LAPACK_dgeqr2( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, + lapack_int* info ); + +#define LAPACK_sgeqr2 
LAPACK_GLOBAL(sgeqr2,SGEQR2) +void LAPACK_sgeqr2( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, + lapack_int* info ); + +#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2,ZGEQR2) +void LAPACK_zgeqr2( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf,CGEQRF) +void LAPACK_cgeqrf( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf,DGEQRF) +void LAPACK_dgeqrf( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf,SGEQRF) +void LAPACK_sgeqrf( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf,ZGEQRF) +void LAPACK_zgeqrf( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp,CGEQRFP) +void LAPACK_cgeqrfp( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp,DGEQRFP) +void LAPACK_dgeqrfp( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp,SGEQRFP) +void LAPACK_sgeqrfp( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp,ZGEQRFP) +void LAPACK_zgeqrfp( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt,CGEQRT) +void LAPACK_cgeqrt( + lapack_int const* m, lapack_int const* n, lapack_int const* nb, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt,DGEQRT) +void LAPACK_dgeqrt( + lapack_int const* m, lapack_int const* n, lapack_int const* nb, + double* A, lapack_int const* lda, + double* T, lapack_int const* ldt, + double* work, + lapack_int* info ); + +#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt,SGEQRT) +void LAPACK_sgeqrt( + lapack_int const* m, lapack_int const* n, lapack_int const* nb, + float* A, lapack_int const* lda, + float* T, lapack_int const* ldt, + float* work, + lapack_int* info ); + +#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt,ZGEQRT) +void LAPACK_zgeqrt( + lapack_int const* m, lapack_int const* n, lapack_int const* nb, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* work, + 
lapack_int* info ); + +#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2,CGEQRT2) +void LAPACK_cgeqrt2( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2,DGEQRT2) +void LAPACK_dgeqrt2( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2,SGEQRT2) +void LAPACK_sgeqrt2( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2,ZGEQRT2) +void LAPACK_zgeqrt2( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3,CGEQRT3) +void LAPACK_cgeqrt3( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3,DGEQRT3) +void LAPACK_dgeqrt3( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3,SGEQRT3) +void LAPACK_sgeqrt3( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3,ZGEQRT3) +void LAPACK_zgeqrt3( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs,CGERFS) +void LAPACK_cgerfs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs,DGERFS) +void LAPACK_dgerfs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs,SGERFS) +void LAPACK_sgerfs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs,ZGERFS) +void LAPACK_zgerfs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* 
X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx,CGERFSX) +void LAPACK_cgerfsx( + char const* trans, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + float const* R, + float const* C, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx,DGERFSX) +void LAPACK_dgerfsx( + char const* trans, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + double const* R, + double const* C, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx,SGERFSX) +void LAPACK_sgerfsx( + char const* trans, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + float const* R, + float const* C, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx,ZGERFSX) +void LAPACK_zgerfsx( + char const* trans, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + double const* R, + double const* C, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgerq2 LAPACK_GLOBAL(cgerq2,CGERQ2) +void LAPACK_cgerq2( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dgerq2 LAPACK_GLOBAL(dgerq2,DGERQ2) +void LAPACK_dgerq2( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, + lapack_int* info ); + +#define LAPACK_sgerq2 LAPACK_GLOBAL(sgerq2,SGERQ2) +void LAPACK_sgerq2( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, + lapack_int* info ); + +#define LAPACK_zgerq2 LAPACK_GLOBAL(zgerq2,ZGERQ2) +void LAPACK_zgerq2( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + 
lapack_complex_double* work,
+ lapack_int* info );
+
+#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf,CGERQF)
+void LAPACK_cgerqf(
+ lapack_int const* m, lapack_int const* n,
+ lapack_complex_float* A, lapack_int const* lda,
+ lapack_complex_float* tau,
+ lapack_complex_float* work, lapack_int const* lwork,
+ lapack_int* info );
+
+#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf,DGERQF)
+void LAPACK_dgerqf(
+ lapack_int const* m, lapack_int const* n,
+ double* A, lapack_int const* lda,
+ double* tau,
+ double* work, lapack_int const* lwork,
+ lapack_int* info );
+
+#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf,SGERQF)
+void LAPACK_sgerqf(
+ lapack_int const* m, lapack_int const* n,
+ float* A, lapack_int const* lda,
+ float* tau,
+ float* work, lapack_int const* lwork,
+ lapack_int* info );
+
+#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf,ZGERQF)
+void LAPACK_zgerqf(
+ lapack_int const* m, lapack_int const* n,
+ lapack_complex_double* A, lapack_int const* lda,
+ lapack_complex_double* tau,
+ lapack_complex_double* work, lapack_int const* lwork,
+ lapack_int* info );
+
+#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd,CGESDD)
+void LAPACK_cgesdd(
+ char const* jobz,
+ lapack_int const* m, lapack_int const* n,
+ lapack_complex_float* A, lapack_int const* lda,
+ float* S,
+ lapack_complex_float* U, lapack_int const* ldu,
+ lapack_complex_float* VT, lapack_int const* ldvt,
+ lapack_complex_float* work, lapack_int const* lwork,
+ float* rwork,
+ lapack_int* iwork,
+ lapack_int* info );
+
+#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd,DGESDD)
+void LAPACK_dgesdd(
+ char const* jobz,
+ lapack_int const* m, lapack_int const* n,
+ double* A, lapack_int const* lda,
+ double* S,
+ double* U, lapack_int const* ldu,
+ double* VT, lapack_int const* ldvt,
+ double* work, lapack_int const* lwork,
+ lapack_int* iwork,
+ lapack_int* info );
+
+#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd,SGESDD)
+void LAPACK_sgesdd(
+ char const* jobz,
+ lapack_int const* m, lapack_int const* n,
+ float* A, lapack_int const* lda,
+ float* S,
+ float* U, lapack_int const* ldu,
+ float* VT, lapack_int const* ldvt,
+ float* work, lapack_int const* lwork,
+ lapack_int* iwork,
+ lapack_int* info );
+
+#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd,ZGESDD)
+void LAPACK_zgesdd(
+ char const* jobz,
+ lapack_int const* m, lapack_int const* n,
+ lapack_complex_double* A, lapack_int const* lda,
+ double* S,
+ lapack_complex_double* U, lapack_int const* ldu,
+ lapack_complex_double* VT, lapack_int const* ldvt,
+ lapack_complex_double* work, lapack_int const* lwork,
+ double* rwork,
+ lapack_int* iwork,
+ lapack_int* info );
+
+#define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
+void LAPACK_cgesv(
+ lapack_int const* n, lapack_int const* nrhs,
+ lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+ lapack_complex_float* B, lapack_int const* ldb,
+ lapack_int* info );
+
+#define LAPACK_dgesv LAPACK_GLOBAL(dgesv,DGESV)
+void LAPACK_dgesv(
+ lapack_int const* n, lapack_int const* nrhs,
+ double* A, lapack_int const* lda, lapack_int* ipiv,
+ double* B, lapack_int const* ldb,
+ lapack_int* info );
+
+#define LAPACK_sgesv LAPACK_GLOBAL(sgesv,SGESV)
+void LAPACK_sgesv(
+ lapack_int const* n, lapack_int const* nrhs,
+ float* A, lapack_int const* lda, lapack_int* ipiv,
+ float* B, lapack_int const* ldb,
+ lapack_int* info );
+
+#define LAPACK_zgesv LAPACK_GLOBAL(zgesv,ZGESV)
+void LAPACK_zgesv(
+ lapack_int const* n, lapack_int const* nrhs,
+ lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+ lapack_complex_double* B, lapack_int const* ldb,
+ lapack_int* info );
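+
+/* Illustrative sketch, not part of the upstream header: the ?gesv drivers
+   declared above factor A as P*L*U and solve A*X = B in place. Every
+   argument is passed by pointer, Fortran style, and B is overwritten with
+   the solution. Assuming the prototypes as declared here (no trailing
+   string-length arguments), a minimal double-precision call with made-up
+   data, error handling omitted, might look like:
+
+     double A[4] = { 4, 1, 2, 3 };   // 2x2 matrix, column-major
+     double b[2] = { 1, 2 };         // one right-hand side
+     lapack_int n = 2, nrhs = 1, lda = 2, ldb = 2, ipiv[2], info;
+     LAPACK_dgesv( &n, &nrhs, A, &lda, ipiv, b, &ldb, &info );
+     // info == 0 on success; b now holds the solution x
+*/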
+
+#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv,DSGESV)
+void LAPACK_dsgesv(
+ lapack_int const* n, lapack_int const* nrhs,
+ double* A, lapack_int const* lda, lapack_int* ipiv,
+ double const* B, lapack_int const* ldb,
+ double* X, lapack_int const* ldx,
+ double* work,
+ float* swork, lapack_int* iter,
+ lapack_int* info );
+
+#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv,ZCGESV)
+void LAPACK_zcgesv(
+ lapack_int const* n, lapack_int const* nrhs,
+ lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+ lapack_complex_double const* B, lapack_int const* ldb,
+ lapack_complex_double* X, lapack_int const* ldx,
+ lapack_complex_double* work,
+ lapack_complex_float* swork,
+ double* rwork, lapack_int* iter,
+ lapack_int* info );
+
+#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd,CGESVD)
+void LAPACK_cgesvd(
+ char const* jobu, char const* jobvt,
+ lapack_int const* m, lapack_int const* n,
+ lapack_complex_float* A, lapack_int const* lda,
+ float* S,
+ lapack_complex_float* U, lapack_int const* ldu,
+ lapack_complex_float* VT, lapack_int const* ldvt,
+ lapack_complex_float* work, lapack_int const* lwork,
+ float* rwork,
+ lapack_int* info );
+
+#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd,DGESVD)
+void LAPACK_dgesvd(
+ char const* jobu, char const* jobvt,
+ lapack_int const* m, lapack_int const* n,
+ double* A, lapack_int const* lda,
+ double* S,
+ double* U, lapack_int const* ldu,
+ double* VT, lapack_int const* ldvt,
+ double* work, lapack_int const* lwork,
+ lapack_int* info );
+
+#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd,SGESVD)
+void LAPACK_sgesvd(
+ char const* jobu, char const* jobvt,
+ lapack_int const* m, lapack_int const* n,
+ float* A, lapack_int const* lda,
+ float* S,
+ float* U, lapack_int const* ldu,
+ float* VT, lapack_int const* ldvt,
+ float* work, lapack_int const* lwork,
+ lapack_int* info );
+
+#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd,ZGESVD)
+void LAPACK_zgesvd(
+ char const* jobu, char const* jobvt,
+ lapack_int const* m, lapack_int const* n,
+ lapack_complex_double* A, lapack_int const* lda,
+ double* S,
+ lapack_complex_double* U, lapack_int const* ldu,
+ lapack_complex_double* VT, lapack_int const* ldvt,
+ lapack_complex_double* work, lapack_int const* lwork,
+ double* rwork,
+ lapack_int* info );
+
+#define LAPACK_cgesvdq LAPACK_GLOBAL(cgesvdq,CGESVDQ)
+void LAPACK_cgesvdq(
+ char const* joba, char const* jobp, char const* jobr, char const* jobu, char const* jobv,
+ lapack_int const* m, lapack_int const* n,
+ lapack_complex_float* A, lapack_int const* lda,
+ float* S,
+ lapack_complex_float* U, lapack_int const* ldu,
+ lapack_complex_float* V, lapack_int const* ldv, lapack_int* numrank,
+ lapack_int* iwork, lapack_int const* liwork,
+ lapack_complex_float* cwork, lapack_int* lcwork,
+ float* rwork, lapack_int const* lrwork,
+ lapack_int* info );
+
+#define LAPACK_dgesvdq LAPACK_GLOBAL(dgesvdq,DGESVDQ)
+void LAPACK_dgesvdq(
+ char const* joba, char const* jobp, char const* jobr, char const* jobu, char const* jobv,
+ lapack_int const* m, lapack_int const* n,
+ double* A, lapack_int const* lda,
+ double* S,
+ double* U, lapack_int const* ldu,
+ double* V, lapack_int const* ldv, lapack_int* numrank,
+ lapack_int* iwork, lapack_int const* liwork,
+ double* work, lapack_int* lwork,
+ double* rwork, lapack_int const* lrwork,
+ lapack_int* info );
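+
+/* Illustrative sketch, not part of the upstream header: the ?gesvd drivers
+   declared above compute A = U * SIGMA * V^T, using the usual two-phase
+   workspace query (lwork = -1 returns the optimal size in work[0]).
+   Assuming the prototypes as declared here, a minimal double-precision
+   call with made-up data (error handling and free() omitted) might be:
+
+     #include <stdlib.h>
+     double A[6] = { 1, 2, 3, 4, 5, 6 };  // 3x2 matrix, column-major
+     double S[2], U[9], VT[4], wkopt;
+     lapack_int m = 3, n = 2, lda = 3, ldu = 3, ldvt = 2,
+                lwork = -1, info;
+     LAPACK_dgesvd( "A", "A", &m, &n, A, &lda, S, U, &ldu, VT, &ldvt,
+                    &wkopt, &lwork, &info );          // workspace query
+     lwork = (lapack_int)wkopt;
+     double* work = malloc( lwork * sizeof(double) );
+     LAPACK_dgesvd( "A", "A", &m, &n, A, &lda, S, U, &ldu, VT, &ldvt,
+                    work, &lwork, &info );            // compute the SVD
+*/
+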
+    float* A, lapack_int const* lda,
+    float* S,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv, lapack_int* numrank,
+    lapack_int* iwork, lapack_int const* liwork,
+    float* work, lapack_int* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_zgesvdq LAPACK_GLOBAL(zgesvdq,ZGESVDQ)
+void LAPACK_zgesvdq(
+    char const* joba, char const* jobp, char const* jobr, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* S,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv, lapack_int* numrank,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_complex_double* cwork, lapack_int* lcwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_cgesvdx LAPACK_GLOBAL(cgesvdx,CGESVDX)
+void LAPACK_cgesvdx(
+    char const* jobu, char const* jobvt, char const* range,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    float* S,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* VT, lapack_int const* ldvt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dgesvdx LAPACK_GLOBAL(dgesvdx,DGESVDX)
+void LAPACK_dgesvdx(
+    char const* jobu, char const* jobvt, char const* range,
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    double* S,
+    double* U, lapack_int const* ldu,
+    double* VT, lapack_int const* ldvt,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgesvdx LAPACK_GLOBAL(sgesvdx,SGESVDX)
+void LAPACK_sgesvdx(
+    char const* jobu, char const* jobvt, char const* range,
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    float* S,
+    float* U, lapack_int const* ldu,
+    float* VT, lapack_int const* ldvt,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgesvdx LAPACK_GLOBAL(zgesvdx,ZGESVDX)
+void LAPACK_zgesvdx(
+    char const* jobu, char const* jobvt, char const* range,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    double* S,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* VT, lapack_int const* ldvt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cgesvj LAPACK_GLOBAL(cgesvj,CGESVJ)
+void LAPACK_cgesvj(
+    char const* joba, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* SVA, lapack_int const* mv,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* cwork, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj,DGESVJ)
+void LAPACK_dgesvj(
+    char const* joba, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+ double* A, lapack_int const* lda, + double* SVA, lapack_int const* mv, + double* V, lapack_int const* ldv, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj,SGESVJ) +void LAPACK_sgesvj( + char const* joba, char const* jobu, char const* jobv, + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* SVA, lapack_int const* mv, + float* V, lapack_int const* ldv, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgesvj LAPACK_GLOBAL(zgesvj,ZGESVJ) +void LAPACK_zgesvj( + char const* joba, char const* jobu, char const* jobv, + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double* SVA, lapack_int const* mv, + lapack_complex_double* V, lapack_int const* ldv, + lapack_complex_double* cwork, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* info ); + +#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx,CGESVX) +void LAPACK_cgesvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + float* R, + float* C, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx,DGESVX) +void LAPACK_dgesvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + double* R, + double* C, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx,SGESVX) +void LAPACK_sgesvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + float* R, + float* C, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx,ZGESVX) +void LAPACK_zgesvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + double* R, + double* C, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx,CGESVXX) +void LAPACK_cgesvxx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + float* R, + float* C, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* 
params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx,DGESVXX) +void LAPACK_dgesvxx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + double* R, + double* C, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx,SGESVXX) +void LAPACK_sgesvxx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + float* R, + float* C, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx,ZGESVXX) +void LAPACK_zgesvxx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + double* R, + double* C, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2,CGETF2) +void LAPACK_cgetf2( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2,DGETF2) +void LAPACK_dgetf2( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2,SGETF2) +void LAPACK_sgetf2( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2,ZGETF2) +void LAPACK_zgetf2( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf,CGETRF) +void LAPACK_cgetrf( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf,DGETRF) +void LAPACK_dgetrf( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF) +void LAPACK_sgetrf( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf,ZGETRF) +void LAPACK_zgetrf( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + 
lapack_int* info ); + +#define LAPACK_cgetrf2 LAPACK_GLOBAL(cgetrf2,CGETRF2) +void LAPACK_cgetrf2( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_dgetrf2 LAPACK_GLOBAL(dgetrf2,DGETRF2) +void LAPACK_dgetrf2( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_sgetrf2 LAPACK_GLOBAL(sgetrf2,SGETRF2) +void LAPACK_sgetrf2( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_zgetrf2 LAPACK_GLOBAL(zgetrf2,ZGETRF2) +void LAPACK_zgetrf2( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_cgetri LAPACK_GLOBAL(cgetri,CGETRI) +void LAPACK_cgetri( + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgetri LAPACK_GLOBAL(dgetri,DGETRI) +void LAPACK_dgetri( + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int const* ipiv, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgetri LAPACK_GLOBAL(sgetri,SGETRI) +void LAPACK_sgetri( + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int const* ipiv, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgetri LAPACK_GLOBAL(zgetri,ZGETRI) +void LAPACK_zgetri( + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs,CGETRS) +void LAPACK_cgetrs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs,DGETRS) +void LAPACK_dgetrs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs,SGETRS) +void LAPACK_sgetrs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs,ZGETRS) +void LAPACK_zgetrs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cgetsls LAPACK_GLOBAL(cgetsls,CGETSLS) +void LAPACK_cgetsls( + char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgetsls LAPACK_GLOBAL(dgetsls,DGETSLS) +void LAPACK_dgetsls( + char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* work, lapack_int const* lwork, + lapack_int* info 
); + +#define LAPACK_sgetsls LAPACK_GLOBAL(sgetsls,SGETSLS) +void LAPACK_sgetsls( + char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgetsls LAPACK_GLOBAL(zgetsls,ZGETSLS) +void LAPACK_zgetsls( + char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK) +void LAPACK_cggbak( + char const* job, char const* side, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float const* lscale, + float const* rscale, lapack_int const* m, + lapack_complex_float* V, lapack_int const* ldv, + lapack_int* info ); + +#define LAPACK_dggbak LAPACK_GLOBAL(dggbak,DGGBAK) +void LAPACK_dggbak( + char const* job, char const* side, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double const* lscale, + double const* rscale, lapack_int const* m, + double* V, lapack_int const* ldv, + lapack_int* info ); + +#define LAPACK_sggbak LAPACK_GLOBAL(sggbak,SGGBAK) +void LAPACK_sggbak( + char const* job, char const* side, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float const* lscale, + float const* rscale, lapack_int const* m, + float* V, lapack_int const* ldv, + lapack_int* info ); + +#define LAPACK_zggbak LAPACK_GLOBAL(zggbak,ZGGBAK) +void LAPACK_zggbak( + char const* job, char const* side, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double const* lscale, + double const* rscale, lapack_int const* m, + lapack_complex_double* V, lapack_int const* ldv, + lapack_int* info ); + +#define LAPACK_cggbal LAPACK_GLOBAL(cggbal,CGGBAL) +void LAPACK_cggbal( + char const* job, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, lapack_int* ilo, lapack_int* ihi, + float* lscale, + float* rscale, + float* work, + lapack_int* info ); + +#define LAPACK_dggbal LAPACK_GLOBAL(dggbal,DGGBAL) +void LAPACK_dggbal( + char const* job, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, lapack_int* ilo, lapack_int* ihi, + double* lscale, + double* rscale, + double* work, + lapack_int* info ); + +#define LAPACK_sggbal LAPACK_GLOBAL(sggbal,SGGBAL) +void LAPACK_sggbal( + char const* job, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, lapack_int* ilo, lapack_int* ihi, + float* lscale, + float* rscale, + float* work, + lapack_int* info ); + +#define LAPACK_zggbal LAPACK_GLOBAL(zggbal,ZGGBAL) +void LAPACK_zggbal( + char const* job, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, lapack_int* ilo, lapack_int* ihi, + double* lscale, + double* rscale, + double* work, + lapack_int* info ); + +#define LAPACK_cgges LAPACK_GLOBAL(cgges,CGGES) +void LAPACK_cgges( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_C_SELECT2 selctg, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, lapack_int* sdim, + lapack_complex_float* alpha, + lapack_complex_float* beta, + lapack_complex_float* VSL, lapack_int const* 
ldvsl, + lapack_complex_float* VSR, lapack_int const* ldvsr, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_dgges LAPACK_GLOBAL(dgges,DGGES) +void LAPACK_dgges( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_D_SELECT3 selctg, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, lapack_int* sdim, + double* alphar, + double* alphai, + double* beta, + double* VSL, lapack_int const* ldvsl, + double* VSR, lapack_int const* ldvsr, + double* work, lapack_int const* lwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_sgges LAPACK_GLOBAL(sgges,SGGES) +void LAPACK_sgges( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_S_SELECT3 selctg, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, lapack_int* sdim, + float* alphar, + float* alphai, + float* beta, + float* VSL, lapack_int const* ldvsl, + float* VSR, lapack_int const* ldvsr, + float* work, lapack_int const* lwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_zgges LAPACK_GLOBAL(zgges,ZGGES) +void LAPACK_zgges( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_Z_SELECT2 selctg, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, lapack_int* sdim, + lapack_complex_double* alpha, + lapack_complex_double* beta, + lapack_complex_double* VSL, lapack_int const* ldvsl, + lapack_complex_double* VSR, lapack_int const* ldvsr, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_cgges3 LAPACK_GLOBAL(cgges3,CGGES3) +void LAPACK_cgges3( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_C_SELECT2 selctg, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, lapack_int* sdim, + lapack_complex_float* alpha, + lapack_complex_float* beta, + lapack_complex_float* VSL, lapack_int const* ldvsl, + lapack_complex_float* VSR, lapack_int const* ldvsr, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_dgges3 LAPACK_GLOBAL(dgges3,DGGES3) +void LAPACK_dgges3( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_D_SELECT3 selctg, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, lapack_int* sdim, + double* alphar, + double* alphai, + double* beta, + double* VSL, lapack_int const* ldvsl, + double* VSR, lapack_int const* ldvsr, + double* work, lapack_int const* lwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_sgges3 LAPACK_GLOBAL(sgges3,SGGES3) +void LAPACK_sgges3( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_S_SELECT3 selctg, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, lapack_int* sdim, + float* alphar, + float* alphai, + float* beta, + float* VSL, lapack_int const* ldvsl, + float* VSR, lapack_int const* ldvsr, + float* work, lapack_int const* lwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_zgges3 LAPACK_GLOBAL(zgges3,ZGGES3) +void LAPACK_zgges3( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_Z_SELECT2 selctg, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, 
lapack_int const* ldb, lapack_int* sdim, + lapack_complex_double* alpha, + lapack_complex_double* beta, + lapack_complex_double* VSL, lapack_int const* ldvsl, + lapack_complex_double* VSR, lapack_int const* ldvsr, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_cggesx LAPACK_GLOBAL(cggesx,CGGESX) +void LAPACK_cggesx( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_C_SELECT2 selctg, char const* sense, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, lapack_int* sdim, + lapack_complex_float* alpha, + lapack_complex_float* beta, + lapack_complex_float* VSL, lapack_int const* ldvsl, + lapack_complex_float* VSR, lapack_int const* ldvsr, + float* rconde, + float* rcondv, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_dggesx LAPACK_GLOBAL(dggesx,DGGESX) +void LAPACK_dggesx( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_D_SELECT3 selctg, char const* sense, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, lapack_int* sdim, + double* alphar, + double* alphai, + double* beta, + double* VSL, lapack_int const* ldvsl, + double* VSR, lapack_int const* ldvsr, + double* rconde, + double* rcondv, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_sggesx LAPACK_GLOBAL(sggesx,SGGESX) +void LAPACK_sggesx( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_S_SELECT3 selctg, char const* sense, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, lapack_int* sdim, + float* alphar, + float* alphai, + float* beta, + float* VSL, lapack_int const* ldvsl, + float* VSR, lapack_int const* ldvsr, + float* rconde, + float* rcondv, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_zggesx LAPACK_GLOBAL(zggesx,ZGGESX) +void LAPACK_zggesx( + char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_Z_SELECT2 selctg, char const* sense, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, lapack_int* sdim, + lapack_complex_double* alpha, + lapack_complex_double* beta, + lapack_complex_double* VSL, lapack_int const* ldvsl, + lapack_complex_double* VSR, lapack_int const* ldvsr, + double* rconde, + double* rcondv, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_cggev LAPACK_GLOBAL(cggev,CGGEV) +void LAPACK_cggev( + char const* jobvl, char const* jobvr, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* alpha, + lapack_complex_float* beta, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dggev LAPACK_GLOBAL(dggev,DGGEV) +void LAPACK_dggev( + char const* jobvl, char const* jobvr, + lapack_int const* n, + double* A, lapack_int const* lda, + 
double* B, lapack_int const* ldb, + double* alphar, + double* alphai, + double* beta, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sggev LAPACK_GLOBAL(sggev,SGGEV) +void LAPACK_sggev( + char const* jobvl, char const* jobvr, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* alphar, + float* alphai, + float* beta, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zggev LAPACK_GLOBAL(zggev,ZGGEV) +void LAPACK_zggev( + char const* jobvl, char const* jobvr, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* alpha, + lapack_complex_double* beta, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cggev3 LAPACK_GLOBAL(cggev3,CGGEV3) +void LAPACK_cggev3( + char const* jobvl, char const* jobvr, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* alpha, + lapack_complex_float* beta, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dggev3 LAPACK_GLOBAL(dggev3,DGGEV3) +void LAPACK_dggev3( + char const* jobvl, char const* jobvr, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* alphar, + double* alphai, + double* beta, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sggev3 LAPACK_GLOBAL(sggev3,SGGEV3) +void LAPACK_sggev3( + char const* jobvl, char const* jobvr, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* alphar, + float* alphai, + float* beta, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zggev3 LAPACK_GLOBAL(zggev3,ZGGEV3) +void LAPACK_zggev3( + char const* jobvl, char const* jobvr, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* alpha, + lapack_complex_double* beta, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cggevx LAPACK_GLOBAL(cggevx,CGGEVX) +void LAPACK_cggevx( + char const* balanc, char const* jobvl, char const* jobvr, char const* sense, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* alpha, + lapack_complex_float* beta, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi, + float* lscale, + float* rscale, + float* abnrm, + float* bbnrm, + float* rconde, + float* rcondv, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, 
+ lapack_int* iwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_dggevx LAPACK_GLOBAL(dggevx,DGGEVX) +void LAPACK_dggevx( + char const* balanc, char const* jobvl, char const* jobvr, char const* sense, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* alphar, + double* alphai, + double* beta, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi, + double* lscale, + double* rscale, + double* abnrm, + double* bbnrm, + double* rconde, + double* rcondv, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_sggevx LAPACK_GLOBAL(sggevx,SGGEVX) +void LAPACK_sggevx( + char const* balanc, char const* jobvl, char const* jobvr, char const* sense, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* alphar, + float* alphai, + float* beta, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi, + float* lscale, + float* rscale, + float* abnrm, + float* bbnrm, + float* rconde, + float* rcondv, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_zggevx LAPACK_GLOBAL(zggevx,ZGGEVX) +void LAPACK_zggevx( + char const* balanc, char const* jobvl, char const* jobvr, char const* sense, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* alpha, + lapack_complex_double* beta, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi, + double* lscale, + double* rscale, + double* abnrm, + double* bbnrm, + double* rconde, + double* rcondv, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* iwork, lapack_logical* BWORK, + lapack_int* info ); + +#define LAPACK_cggglm LAPACK_GLOBAL(cggglm,CGGGLM) +void LAPACK_cggglm( + lapack_int const* n, lapack_int const* m, lapack_int const* p, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* D, + lapack_complex_float* X, + lapack_complex_float* Y, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dggglm LAPACK_GLOBAL(dggglm,DGGGLM) +void LAPACK_dggglm( + lapack_int const* n, lapack_int const* m, lapack_int const* p, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* D, + double* X, + double* Y, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sggglm LAPACK_GLOBAL(sggglm,SGGGLM) +void LAPACK_sggglm( + lapack_int const* n, lapack_int const* m, lapack_int const* p, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* D, + float* X, + float* Y, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zggglm LAPACK_GLOBAL(zggglm,ZGGGLM) +void LAPACK_zggglm( + lapack_int const* n, lapack_int const* m, lapack_int const* p, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* D, + lapack_complex_double* X, + lapack_complex_double* Y, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgghd3 LAPACK_GLOBAL(cgghd3,CGGHD3) +void LAPACK_cgghd3( + char const* compq, char 
const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgghd3 LAPACK_GLOBAL(dgghd3,DGGHD3) +void LAPACK_dgghd3( + char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* Q, lapack_int const* ldq, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgghd3 LAPACK_GLOBAL(sgghd3,SGGHD3) +void LAPACK_sgghd3( + char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* Q, lapack_int const* ldq, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgghd3 LAPACK_GLOBAL(zgghd3,ZGGHD3) +void LAPACK_zgghd3( + char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd,CGGHRD) +void LAPACK_cgghrd( + char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_int* info ); + +#define LAPACK_dgghrd LAPACK_GLOBAL(dgghrd,DGGHRD) +void LAPACK_dgghrd( + char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* Q, lapack_int const* ldq, + double* Z, lapack_int const* ldz, + lapack_int* info ); + +#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd,SGGHRD) +void LAPACK_sgghrd( + char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* Q, lapack_int const* ldq, + float* Z, lapack_int const* ldz, + lapack_int* info ); + +#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd,ZGGHRD) +void LAPACK_zgghrd( + char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_int* info ); + +#define LAPACK_cgglse LAPACK_GLOBAL(cgglse,CGGLSE) +void LAPACK_cgglse( + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* C, + lapack_complex_float* D, + lapack_complex_float* X, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgglse 
LAPACK_GLOBAL(dgglse,DGGLSE) +void LAPACK_dgglse( + lapack_int const* m, lapack_int const* n, lapack_int const* p, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* C, + double* D, + double* X, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgglse LAPACK_GLOBAL(sgglse,SGGLSE) +void LAPACK_sgglse( + lapack_int const* m, lapack_int const* n, lapack_int const* p, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* C, + float* D, + float* X, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgglse LAPACK_GLOBAL(zgglse,ZGGLSE) +void LAPACK_zgglse( + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* C, + lapack_complex_double* D, + lapack_complex_double* X, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf,CGGQRF) +void LAPACK_cggqrf( + lapack_int const* n, lapack_int const* m, lapack_int const* p, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* taua, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* taub, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf,DGGQRF) +void LAPACK_dggqrf( + lapack_int const* n, lapack_int const* m, lapack_int const* p, + double* A, lapack_int const* lda, + double* taua, + double* B, lapack_int const* ldb, + double* taub, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf,SGGQRF) +void LAPACK_sggqrf( + lapack_int const* n, lapack_int const* m, lapack_int const* p, + float* A, lapack_int const* lda, + float* taua, + float* B, lapack_int const* ldb, + float* taub, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf,ZGGQRF) +void LAPACK_zggqrf( + lapack_int const* n, lapack_int const* m, lapack_int const* p, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* taua, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* taub, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf,CGGRQF) +void LAPACK_cggrqf( + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* taua, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* taub, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf,DGGRQF) +void LAPACK_dggrqf( + lapack_int const* m, lapack_int const* p, lapack_int const* n, + double* A, lapack_int const* lda, + double* taua, + double* B, lapack_int const* ldb, + double* taub, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf,SGGRQF) +void LAPACK_sggrqf( + lapack_int const* m, lapack_int const* p, lapack_int const* n, + float* A, lapack_int const* lda, + float* taua, + float* B, lapack_int const* ldb, + float* taub, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf,ZGGRQF) +void LAPACK_zggrqf( + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + 
lapack_complex_double* taua,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* taub,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
+void LAPACK_sggsvd(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* alpha,
+    float* beta,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv,
+    float* Q, lapack_int const* ldq,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
+void LAPACK_dggsvd(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* alpha,
+    double* beta,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv,
+    double* Q, lapack_int const* ldq,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
+void LAPACK_cggsvd(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float* alpha,
+    float* beta,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
+void LAPACK_zggsvd(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double* alpha,
+    double* beta,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
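(The deprecated xGGSVD drivers above take a fixed-size work array, whereas
the xGGSVD3 replacements that follow add the usual lwork argument and so
support LAPACK's standard workspace query. A minimal sketch of that idiom
against the dggsvd3 prototype below, assuming all the other arguments are
already set up as for a real call:

    lapack_int lwork = -1, info = 0;  /* lwork == -1 requests a size query */
    double wkopt;
    LAPACK_dggsvd3(&jobu, &jobv, &jobq, &m, &n, &p, &k, &l,
                   a, &lda, b, &ldb, alpha, beta, u, &ldu,
                   v, &ldv, q, &ldq, &wkopt, &lwork, iwork, &info);
    lwork = (lapack_int)wkopt;        /* optimal size returned in wkopt */
    /* allocate work[lwork], then repeat the call to do the factorization */

)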
+
+#define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3)
+void LAPACK_cggsvd3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float* alpha,
+    float* beta,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dggsvd3 LAPACK_GLOBAL(dggsvd3,DGGSVD3)
+void LAPACK_dggsvd3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* alpha,
+    double* beta,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv,
+    double* Q, lapack_int const* ldq,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sggsvd3 LAPACK_GLOBAL(sggsvd3,SGGSVD3)
+void LAPACK_sggsvd3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* alpha,
+    float* beta,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv,
+    float* Q, lapack_int const* ldq,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zggsvd3 LAPACK_GLOBAL(zggsvd3,ZGGSVD3)
+void LAPACK_zggsvd3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double* alpha,
+    double* beta,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP)
+void LAPACK_sggsvp(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float const* tola,
+    float const* tolb, lapack_int* k, lapack_int* l,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv,
+    float* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    float* tau,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP)
+void LAPACK_dggsvp(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double const* tola,
+    double const* tolb, lapack_int* k, lapack_int* l,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv,
+    double* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    double* tau,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP)
+void LAPACK_cggsvp(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float const* tola,
+    float const* tolb, lapack_int* k, lapack_int* l,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    float* rwork,
+    lapack_complex_float* tau,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP)
+void LAPACK_zggsvp(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double const* tola,
+    double const* tolb, lapack_int* k, lapack_int* l,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    double* rwork,
+    lapack_complex_double* tau,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3)
+void LAPACK_cggsvp3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float const* tola,
+    float const* tolb, lapack_int* k, lapack_int* l,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    float* rwork,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dggsvp3 LAPACK_GLOBAL(dggsvp3,DGGSVP3)
+void LAPACK_dggsvp3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double const* tola,
+    double const* tolb, lapack_int* k, lapack_int* l,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv,
+    double* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    double* tau,
+    double* work, lapack_int const*
lwork, + lapack_int* info ); + +#define LAPACK_sggsvp3 LAPACK_GLOBAL(sggsvp3,SGGSVP3) +void LAPACK_sggsvp3( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float const* tola, + float const* tolb, lapack_int* k, lapack_int* l, + float* U, lapack_int const* ldu, + float* V, lapack_int const* ldv, + float* Q, lapack_int const* ldq, + lapack_int* iwork, + float* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zggsvp3 LAPACK_GLOBAL(zggsvp3,ZGGSVP3) +void LAPACK_zggsvp3( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + double const* tola, + double const* tolb, lapack_int* k, lapack_int* l, + lapack_complex_double* U, lapack_int const* ldu, + lapack_complex_double* V, lapack_int const* ldv, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_int* iwork, + double* rwork, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon,CGTCON) +void LAPACK_cgtcon( + char const* norm, + lapack_int const* n, + lapack_complex_float const* DL, + lapack_complex_float const* D, + lapack_complex_float const* DU, + lapack_complex_float const* DU2, lapack_int const* ipiv, + float const* anorm, + float* rcond, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon,DGTCON) +void LAPACK_dgtcon( + char const* norm, + lapack_int const* n, + double const* DL, + double const* D, + double const* DU, + double const* DU2, lapack_int const* ipiv, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon,SGTCON) +void LAPACK_sgtcon( + char const* norm, + lapack_int const* n, + float const* DL, + float const* D, + float const* DU, + float const* DU2, lapack_int const* ipiv, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon,ZGTCON) +void LAPACK_zgtcon( + char const* norm, + lapack_int const* n, + lapack_complex_double const* DL, + lapack_complex_double const* D, + lapack_complex_double const* DU, + lapack_complex_double const* DU2, lapack_int const* ipiv, + double const* anorm, + double* rcond, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs,CGTRFS) +void LAPACK_cgtrfs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* DL, + lapack_complex_float const* D, + lapack_complex_float const* DU, + lapack_complex_float const* DLF, + lapack_complex_float const* DF, + lapack_complex_float const* DUF, + lapack_complex_float const* DU2, lapack_int const* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs,DGTRFS) +void LAPACK_dgtrfs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + double const* DL, + double const* D, + double const* DU, + double const* DLF, + double const* DF, + double const* DUF, + double const* DU2, lapack_int const* ipiv, + double 
const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs,SGTRFS) +void LAPACK_sgtrfs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + float const* DL, + float const* D, + float const* DU, + float const* DLF, + float const* DF, + float const* DUF, + float const* DU2, lapack_int const* ipiv, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs,ZGTRFS) +void LAPACK_zgtrfs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* DL, + lapack_complex_double const* D, + lapack_complex_double const* DU, + lapack_complex_double const* DLF, + lapack_complex_double const* DF, + lapack_complex_double const* DUF, + lapack_complex_double const* DU2, lapack_int const* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv,CGTSV) +void LAPACK_cgtsv( + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* DL, + lapack_complex_float* D, + lapack_complex_float* DU, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv,DGTSV) +void LAPACK_dgtsv( + lapack_int const* n, lapack_int const* nrhs, + double* DL, + double* D, + double* DU, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv,SGTSV) +void LAPACK_sgtsv( + lapack_int const* n, lapack_int const* nrhs, + float* DL, + float* D, + float* DU, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv,ZGTSV) +void LAPACK_zgtsv( + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* DL, + lapack_complex_double* D, + lapack_complex_double* DU, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx,CGTSVX) +void LAPACK_cgtsvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* DL, + lapack_complex_float const* D, + lapack_complex_float const* DU, + lapack_complex_float* DLF, + lapack_complex_float* DF, + lapack_complex_float* DUF, + lapack_complex_float* DU2, lapack_int* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx,DGTSVX) +void LAPACK_dgtsvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + double const* DL, + double const* D, + double const* DU, + double* DLF, + double* DF, + double* DUF, + double* DU2, lapack_int* ipiv, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx,SGTSVX) +void LAPACK_sgtsvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + float const* DL, + float const* D, + float const* DU, + float* 
DLF, + float* DF, + float* DUF, + float* DU2, lapack_int* ipiv, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx,ZGTSVX) +void LAPACK_zgtsvx( + char const* fact, char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* DL, + lapack_complex_double const* D, + lapack_complex_double const* DU, + lapack_complex_double* DLF, + lapack_complex_double* DF, + lapack_complex_double* DUF, + lapack_complex_double* DU2, lapack_int* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf,CGTTRF) +void LAPACK_cgttrf( + lapack_int const* n, + lapack_complex_float* DL, + lapack_complex_float* D, + lapack_complex_float* DU, + lapack_complex_float* DU2, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf,DGTTRF) +void LAPACK_dgttrf( + lapack_int const* n, + double* DL, + double* D, + double* DU, + double* DU2, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf,SGTTRF) +void LAPACK_sgttrf( + lapack_int const* n, + float* DL, + float* D, + float* DU, + float* DU2, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf,ZGTTRF) +void LAPACK_zgttrf( + lapack_int const* n, + lapack_complex_double* DL, + lapack_complex_double* D, + lapack_complex_double* DU, + lapack_complex_double* DU2, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs,CGTTRS) +void LAPACK_cgttrs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* DL, + lapack_complex_float const* D, + lapack_complex_float const* DU, + lapack_complex_float const* DU2, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs,DGTTRS) +void LAPACK_dgttrs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + double const* DL, + double const* D, + double const* DU, + double const* DU2, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs,SGTTRS) +void LAPACK_sgttrs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + float const* DL, + float const* D, + float const* DU, + float const* DU2, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs,ZGTTRS) +void LAPACK_zgttrs( + char const* trans, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* DL, + lapack_complex_double const* D, + lapack_complex_double const* DU, + lapack_complex_double const* DU2, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_chbev LAPACK_GLOBAL(chbev,CHBEV) +void LAPACK_chbev( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhbev LAPACK_GLOBAL(zhbev,ZHBEV) +void LAPACK_zhbev( + char const* jobz, char 
const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_chbev_2stage LAPACK_GLOBAL(chbev_2stage,CHBEV_2STAGE) +void LAPACK_chbev_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhbev_2stage LAPACK_GLOBAL(zhbev_2stage,ZHBEV_2STAGE) +void LAPACK_zhbev_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_chbevd LAPACK_GLOBAL(chbevd,CHBEVD) +void LAPACK_chbevd( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd,ZHBEVD) +void LAPACK_zhbevd( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_chbevd_2stage LAPACK_GLOBAL(chbevd_2stage,CHBEVD_2STAGE) +void LAPACK_chbevd_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zhbevd_2stage LAPACK_GLOBAL(zhbevd_2stage,ZHBEVD_2STAGE) +void LAPACK_zhbevd_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_chbevx LAPACK_GLOBAL(chbevx,CHBEVX) +void LAPACK_chbevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_complex_float* Q, lapack_int const* ldq, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, + float* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx,ZHBEVX) +void LAPACK_zhbevx( + char const* jobz, char const* range, char const* uplo, + 
lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* Q, lapack_int const* ldq, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, + double* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_chbevx_2stage LAPACK_GLOBAL(chbevx_2stage,CHBEVX_2STAGE) +void LAPACK_chbevx_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_complex_float* Q, lapack_int const* ldq, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zhbevx_2stage LAPACK_GLOBAL(zhbevx_2stage,ZHBEVX_2STAGE) +void LAPACK_zhbevx_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* Q, lapack_int const* ldq, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_chbgst LAPACK_GLOBAL(chbgst,CHBGST) +void LAPACK_chbgst( + char const* vect, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_complex_float const* BB, lapack_int const* ldbb, + lapack_complex_float* X, lapack_int const* ldx, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst,ZHBGST) +void LAPACK_zhbgst( + char const* vect, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double const* BB, lapack_int const* ldbb, + lapack_complex_double* X, lapack_int const* ldx, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_chbgv LAPACK_GLOBAL(chbgv,CHBGV) +void LAPACK_chbgv( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_complex_float* BB, lapack_int const* ldbb, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv,ZHBGV) +void LAPACK_zhbgv( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* BB, lapack_int const* ldbb, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd,CHBGVD) +void LAPACK_chbgvd( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + lapack_complex_float* AB, 
lapack_int const* ldab, + lapack_complex_float* BB, lapack_int const* ldbb, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd,ZHBGVD) +void LAPACK_zhbgvd( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* BB, lapack_int const* ldbb, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx,CHBGVX) +void LAPACK_chbgvx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_complex_float* BB, lapack_int const* ldbb, + lapack_complex_float* Q, lapack_int const* ldq, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, + float* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx,ZHBGVX) +void LAPACK_zhbgvx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* BB, lapack_int const* ldbb, + lapack_complex_double* Q, lapack_int const* ldq, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, + double* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd,CHBTRD) +void LAPACK_chbtrd( + char const* vect, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + float* D, + float* E, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd,ZHBTRD) +void LAPACK_zhbtrd( + char const* vect, char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + double* D, + double* E, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_checon LAPACK_GLOBAL(checon,CHECON) +void LAPACK_checon( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + float const* anorm, + float* rcond, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zhecon LAPACK_GLOBAL(zhecon,ZHECON) +void LAPACK_zhecon( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + double const* anorm, + double* rcond, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_checon_3 LAPACK_GLOBAL(checon_3,CHECON_3) +void LAPACK_checon_3( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* 
lda, + lapack_complex_float const* E, lapack_int const* ipiv, + float const* anorm, + float* rcond, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zhecon_3 LAPACK_GLOBAL(zhecon_3,ZHECON_3) +void LAPACK_zhecon_3( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* E, lapack_int const* ipiv, + double const* anorm, + double* rcond, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_cheequb LAPACK_GLOBAL(cheequb,CHEEQUB) +void LAPACK_cheequb( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* S, + float* scond, + float* amax, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zheequb LAPACK_GLOBAL(zheequb,ZHEEQUB) +void LAPACK_zheequb( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* S, + double* scond, + double* amax, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_cheev LAPACK_GLOBAL(cheev,CHEEV) +void LAPACK_cheev( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float* W, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_zheev LAPACK_GLOBAL(zheev,ZHEEV) +void LAPACK_zheev( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double* W, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cheev_2stage LAPACK_GLOBAL(cheev_2stage,CHEEV_2STAGE) +void LAPACK_cheev_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float* W, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_zheev_2stage LAPACK_GLOBAL(zheev_2stage,ZHEEV_2STAGE) +void LAPACK_zheev_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double* W, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_cheevd LAPACK_GLOBAL(cheevd,CHEEVD) +void LAPACK_cheevd( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float* W, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zheevd LAPACK_GLOBAL(zheevd,ZHEEVD) +void LAPACK_zheevd( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double* W, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_cheevd_2stage LAPACK_GLOBAL(cheevd_2stage,CHEEVD_2STAGE) +void LAPACK_cheevd_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float* W, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zheevd_2stage LAPACK_GLOBAL(zheevd_2stage,ZHEEVD_2STAGE) +void LAPACK_zheevd_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, + 
lapack_complex_double* A, lapack_int const* lda, + double* W, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_cheevr LAPACK_GLOBAL(cheevr,CHEEVR) +void LAPACK_cheevr( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zheevr LAPACK_GLOBAL(zheevr,ZHEEVR) +void LAPACK_zheevr( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_cheevr_2stage LAPACK_GLOBAL(cheevr_2stage,CHEEVR_2STAGE) +void LAPACK_cheevr_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zheevr_2stage LAPACK_GLOBAL(zheevr_2stage,ZHEEVR_2STAGE) +void LAPACK_zheevr_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_cheevx LAPACK_GLOBAL(cheevx,CHEEVX) +void LAPACK_cheevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zheevx LAPACK_GLOBAL(zheevx,ZHEEVX) +void LAPACK_zheevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int 
const* lwork, + double* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_cheevx_2stage LAPACK_GLOBAL(cheevx_2stage,CHEEVX_2STAGE) +void LAPACK_cheevx_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zheevx_2stage LAPACK_GLOBAL(zheevx_2stage,ZHEEVX_2STAGE) +void LAPACK_zheevx_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_chegst LAPACK_GLOBAL(chegst,CHEGST) +void LAPACK_chegst( + lapack_int const* itype, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST) +void LAPACK_zhegst( + lapack_int const* itype, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV) +void LAPACK_chegv( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + float* W, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhegv LAPACK_GLOBAL(zhegv,ZHEGV) +void LAPACK_zhegv( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + double* W, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_chegv_2stage LAPACK_GLOBAL(chegv_2stage,CHEGV_2STAGE) +void LAPACK_chegv_2stage( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + float* W, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhegv_2stage LAPACK_GLOBAL(zhegv_2stage,ZHEGV_2STAGE) +void LAPACK_zhegv_2stage( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + double* W, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_chegvd LAPACK_GLOBAL(chegvd,CHEGVD) +void LAPACK_chegvd( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + float* W, + lapack_complex_float* work, 
lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd,ZHEGVD) +void LAPACK_zhegvd( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + double* W, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_chegvx LAPACK_GLOBAL(chegvx,CHEGVX) +void LAPACK_chegvx( + lapack_int const* itype, char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx,ZHEGVX) +void LAPACK_zhegvx( + lapack_int const* itype, char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_cherfs LAPACK_GLOBAL(cherfs,CHERFS) +void LAPACK_cherfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zherfs LAPACK_GLOBAL(zherfs,ZHERFS) +void LAPACK_zherfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx,CHERFSX) +void LAPACK_cherfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + float* S, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx,ZHERFSX) +void LAPACK_zherfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int 
const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + double* S, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_chesv LAPACK_GLOBAL(chesv,CHESV) +void LAPACK_chesv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhesv LAPACK_GLOBAL(zhesv,ZHESV) +void LAPACK_zhesv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chesv_aa LAPACK_GLOBAL(chesv_aa,CHESV_AA) +void LAPACK_chesv_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhesv_aa LAPACK_GLOBAL(zhesv_aa,ZHESV_AA) +void LAPACK_zhesv_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chesv_aa_2stage LAPACK_GLOBAL(chesv_aa_2stage,CHESV_AA_2STAGE) +void LAPACK_chesv_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhesv_aa_2stage LAPACK_GLOBAL(zhesv_aa_2stage,ZHESV_AA_2STAGE) +void LAPACK_zhesv_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chesv_rk LAPACK_GLOBAL(chesv_rk,CHESV_RK) +void LAPACK_chesv_rk( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* E, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhesv_rk LAPACK_GLOBAL(zhesv_rk,ZHESV_RK) +void LAPACK_zhesv_rk( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* E, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chesv_rook 
LAPACK_GLOBAL(chesv_rook,CHESV_ROOK) +void LAPACK_chesv_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhesv_rook LAPACK_GLOBAL(zhesv_rook,ZHESV_ROOK) +void LAPACK_zhesv_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chesvx LAPACK_GLOBAL(chesvx,CHESVX) +void LAPACK_chesvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx,ZHESVX) +void LAPACK_zhesvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx,CHESVXX) +void LAPACK_chesvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + float* S, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx,ZHESVXX) +void LAPACK_zhesvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + double* S, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr,CHESWAPR) +void LAPACK_cheswapr( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 ); + +#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr,ZHESWAPR) +void LAPACK_zheswapr( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 ); + +#define LAPACK_chetrd 
LAPACK_GLOBAL(chetrd,CHETRD) +void LAPACK_chetrd( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float* D, + float* E, + lapack_complex_float* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd,ZHETRD) +void LAPACK_zhetrd( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double* D, + double* E, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetrd_2stage LAPACK_GLOBAL(chetrd_2stage,CHETRD_2STAGE) +void LAPACK_chetrd_2stage( + char const* vect, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + float* D, + float* E, + lapack_complex_float* tau, + lapack_complex_float* HOUS2, lapack_int const* lhous2, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetrd_2stage LAPACK_GLOBAL(zhetrd_2stage,ZHETRD_2STAGE) +void LAPACK_zhetrd_2stage( + char const* vect, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + double* D, + double* E, + lapack_complex_double* tau, + lapack_complex_double* HOUS2, lapack_int const* lhous2, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetrf LAPACK_GLOBAL(chetrf,CHETRF) +void LAPACK_chetrf( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf,ZHETRF) +void LAPACK_zhetrf( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetrf_aa LAPACK_GLOBAL(chetrf_aa,CHETRF_AA) +void LAPACK_chetrf_aa( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetrf_aa LAPACK_GLOBAL(zhetrf_aa,ZHETRF_AA) +void LAPACK_zhetrf_aa( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetrf_aa_2stage LAPACK_GLOBAL(chetrf_aa_2stage,CHETRF_AA_2STAGE) +void LAPACK_chetrf_aa_2stage( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetrf_aa_2stage LAPACK_GLOBAL(zhetrf_aa_2stage,ZHETRF_AA_2STAGE) +void LAPACK_zhetrf_aa_2stage( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetrf_rk LAPACK_GLOBAL(chetrf_rk,CHETRF_RK) +void LAPACK_chetrf_rk( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* E, lapack_int* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetrf_rk 
LAPACK_GLOBAL(zhetrf_rk,ZHETRF_RK) +void LAPACK_zhetrf_rk( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* E, lapack_int* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetrf_rook LAPACK_GLOBAL(chetrf_rook,CHETRF_ROOK) +void LAPACK_chetrf_rook( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetrf_rook LAPACK_GLOBAL(zhetrf_rook,ZHETRF_ROOK) +void LAPACK_zhetrf_rook( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetri LAPACK_GLOBAL(chetri,CHETRI) +void LAPACK_chetri( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zhetri LAPACK_GLOBAL(zhetri,ZHETRI) +void LAPACK_zhetri( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2,CHETRI2) +void LAPACK_chetri2( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2,ZHETRI2) +void LAPACK_zhetri2( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x,CHETRI2X) +void LAPACK_chetri2x( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* work, lapack_int const* nb, + lapack_int* info ); + +#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x,ZHETRI2X) +void LAPACK_zhetri2x( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* work, lapack_int const* nb, + lapack_int* info ); + +#define LAPACK_chetri_3 LAPACK_GLOBAL(chetri_3,CHETRI_3) +void LAPACK_chetri_3( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* E, lapack_int const* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetri_3 LAPACK_GLOBAL(zhetri_3,ZHETRI_3) +void LAPACK_zhetri_3( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* E, lapack_int const* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetrs LAPACK_GLOBAL(chetrs,CHETRS) +void LAPACK_chetrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs,ZHETRS) +void LAPACK_zhetrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, 
lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2,CHETRS2) +void LAPACK_chetrs2( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2,ZHETRS2) +void LAPACK_zhetrs2( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_chetrs_3 LAPACK_GLOBAL(chetrs_3,CHETRS_3) +void LAPACK_chetrs_3( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* E, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zhetrs_3 LAPACK_GLOBAL(zhetrs_3,ZHETRS_3) +void LAPACK_zhetrs_3( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* E, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_chetrs_aa LAPACK_GLOBAL(chetrs_aa,CHETRS_AA) +void LAPACK_chetrs_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhetrs_aa LAPACK_GLOBAL(zhetrs_aa,ZHETRS_AA) +void LAPACK_zhetrs_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_chetrs_aa_2stage LAPACK_GLOBAL(chetrs_aa_2stage,CHETRS_AA_2STAGE) +void LAPACK_chetrs_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zhetrs_aa_2stage LAPACK_GLOBAL(zhetrs_aa_2stage,ZHETRS_AA_2STAGE) +void LAPACK_zhetrs_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_chetrs_rook LAPACK_GLOBAL(chetrs_rook,CHETRS_ROOK) +void LAPACK_chetrs_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zhetrs_rook LAPACK_GLOBAL(zhetrs_rook,ZHETRS_ROOK) +void LAPACK_zhetrs_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + 
lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_chfrk LAPACK_GLOBAL(chfrk,CHFRK) +void LAPACK_chfrk( + char const* transr, char const* uplo, char const* trans, + lapack_int const* n, lapack_int const* k, + float const* alpha, + lapack_complex_float const* A, lapack_int const* lda, + float const* beta, + lapack_complex_float* C ); + +#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk,ZHFRK) +void LAPACK_zhfrk( + char const* transr, char const* uplo, char const* trans, + lapack_int const* n, lapack_int const* k, + double const* alpha, + lapack_complex_double const* A, lapack_int const* lda, + double const* beta, + lapack_complex_double* C ); + +#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz,CHGEQZ) +void LAPACK_chgeqz( + char const* job, char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_float* H, lapack_int const* ldh, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* alpha, + lapack_complex_float* beta, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz,DHGEQZ) +void LAPACK_dhgeqz( + char const* job, char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double* H, lapack_int const* ldh, + double* T, lapack_int const* ldt, + double* alphar, + double* alphai, + double* beta, + double* Q, lapack_int const* ldq, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz,SHGEQZ) +void LAPACK_shgeqz( + char const* job, char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float* H, lapack_int const* ldh, + float* T, lapack_int const* ldt, + float* alphar, + float* alphai, + float* beta, + float* Q, lapack_int const* ldq, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz,ZHGEQZ) +void LAPACK_zhgeqz( + char const* job, char const* compq, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_double* H, lapack_int const* ldh, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* alpha, + lapack_complex_double* beta, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_chpcon LAPACK_GLOBAL(chpcon,CHPCON) +void LAPACK_chpcon( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, lapack_int const* ipiv, + float const* anorm, + float* rcond, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon,ZHPCON) +void LAPACK_zhpcon( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* AP, lapack_int const* ipiv, + double const* anorm, + double* rcond, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_chpev LAPACK_GLOBAL(chpev,CHPEV) +void LAPACK_chpev( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define 
LAPACK_zhpev LAPACK_GLOBAL(zhpev,ZHPEV) +void LAPACK_zhpev( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_chpevd LAPACK_GLOBAL(chpevd,CHPEVD) +void LAPACK_chpevd( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd,ZHPEVD) +void LAPACK_zhpevd( + char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_chpevx LAPACK_GLOBAL(chpevx,CHPEVX) +void LAPACK_chpevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, + float* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx,ZHPEVX) +void LAPACK_zhpevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, + double* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_chpgst LAPACK_GLOBAL(chpgst,CHPGST) +void LAPACK_chpgst( + lapack_int const* itype, char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + lapack_complex_float const* BP, + lapack_int* info ); + +#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst,ZHPGST) +void LAPACK_zhpgst( + lapack_int const* itype, char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + lapack_complex_double const* BP, + lapack_int* info ); + +#define LAPACK_chpgv LAPACK_GLOBAL(chpgv,CHPGV) +void LAPACK_chpgv( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + lapack_complex_float* BP, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv,ZHPGV) +void LAPACK_zhpgv( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + lapack_complex_double* BP, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd,CHPGVD) +void LAPACK_chpgvd( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + lapack_complex_float* BP, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + 
lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd,ZHPGVD) +void LAPACK_zhpgvd( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + lapack_complex_double* BP, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx,CHPGVX) +void LAPACK_chpgvx( + lapack_int const* itype, char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + lapack_complex_float* BP, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, + float* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx,ZHPGVX) +void LAPACK_zhpgvx( + lapack_int const* itype, char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + lapack_complex_double* BP, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, + double* rwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_chprfs LAPACK_GLOBAL(chprfs,CHPRFS) +void LAPACK_chprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, + lapack_complex_float const* AFP, lapack_int const* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs,ZHPRFS) +void LAPACK_zhprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, + lapack_complex_double const* AFP, lapack_int const* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_chpsv LAPACK_GLOBAL(chpsv,CHPSV) +void LAPACK_chpsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* AP, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv,ZHPSV) +void LAPACK_zhpsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* AP, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx,CHPSVX) +void LAPACK_chpsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, + lapack_complex_float* AFP, lapack_int* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx,ZHPSVX) +void LAPACK_zhpsvx( + char const* fact, char const* uplo, + 
lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, + lapack_complex_double* AFP, lapack_int* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_chptrd LAPACK_GLOBAL(chptrd,CHPTRD) +void LAPACK_chptrd( + char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + float* D, + float* E, + lapack_complex_float* tau, + lapack_int* info ); + +#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd,ZHPTRD) +void LAPACK_zhptrd( + char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + double* D, + double* E, + lapack_complex_double* tau, + lapack_int* info ); + +#define LAPACK_chptrf LAPACK_GLOBAL(chptrf,CHPTRF) +void LAPACK_chptrf( + char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf,ZHPTRF) +void LAPACK_zhptrf( + char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_chptri LAPACK_GLOBAL(chptri,CHPTRI) +void LAPACK_chptri( + char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, lapack_int const* ipiv, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zhptri LAPACK_GLOBAL(zhptri,ZHPTRI) +void LAPACK_zhptri( + char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, lapack_int const* ipiv, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_chptrs LAPACK_GLOBAL(chptrs,CHPTRS) +void LAPACK_chptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs,ZHPTRS) +void LAPACK_zhptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_chsein LAPACK_GLOBAL(chsein,CHSEIN) +void LAPACK_chsein( + char const* side, char const* eigsrc, char const* initv, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_float const* H, lapack_int const* ldh, + lapack_complex_float* W, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + lapack_complex_float* work, + float* rwork, lapack_int* IFAILL, lapack_int* IFAILR, + lapack_int* info ); + +#define LAPACK_dhsein LAPACK_GLOBAL(dhsein,DHSEIN) +void LAPACK_dhsein( + char const* side, char const* eigsrc, char const* initv, + lapack_logical* select, + lapack_int const* n, + double const* H, lapack_int const* ldh, + double* WR, + double const* WI, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + double* work, lapack_int* IFAILL, lapack_int* IFAILR, + lapack_int* info ); + +#define LAPACK_shsein LAPACK_GLOBAL(shsein,SHSEIN) +void LAPACK_shsein( + char const* side, char const* eigsrc, char const* initv, + lapack_logical* select, + lapack_int const* n, + float const* H, lapack_int const* ldh, + float* WR, + float const* WI, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + float* work, lapack_int* IFAILL, lapack_int* 
IFAILR, + lapack_int* info ); + +#define LAPACK_zhsein LAPACK_GLOBAL(zhsein,ZHSEIN) +void LAPACK_zhsein( + char const* side, char const* eigsrc, char const* initv, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_double const* H, lapack_int const* ldh, + lapack_complex_double* W, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + lapack_complex_double* work, + double* rwork, lapack_int* IFAILL, lapack_int* IFAILR, + lapack_int* info ); + +#define LAPACK_chseqr LAPACK_GLOBAL(chseqr,CHSEQR) +void LAPACK_chseqr( + char const* job, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_float* H, lapack_int const* ldh, + lapack_complex_float* W, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr,DHSEQR) +void LAPACK_dhseqr( + char const* job, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double* H, lapack_int const* ldh, + double* WR, + double* WI, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_shseqr LAPACK_GLOBAL(shseqr,SHSEQR) +void LAPACK_shseqr( + char const* job, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float* H, lapack_int const* ldh, + float* WR, + float* WI, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr,ZHSEQR) +void LAPACK_zhseqr( + char const* job, char const* compz, + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_double* H, lapack_int const* ldh, + lapack_complex_double* W, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_clacgv LAPACK_GLOBAL(clacgv,CLACGV) +void LAPACK_clacgv( + lapack_int const* n, + lapack_complex_float* X, lapack_int const* incx ); + +#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv,ZLACGV) +void LAPACK_zlacgv( + lapack_int const* n, + lapack_complex_double* X, lapack_int const* incx ); + +#define LAPACK_clacn2 LAPACK_GLOBAL(clacn2,CLACN2) +void LAPACK_clacn2( + lapack_int const* n, + lapack_complex_float* V, + lapack_complex_float* X, + float* est, lapack_int* kase, lapack_int* ISAVE ); + +#define LAPACK_dlacn2 LAPACK_GLOBAL(dlacn2,DLACN2) +void LAPACK_dlacn2( + lapack_int const* n, + double* V, + double* X, lapack_int* ISGN, + double* est, lapack_int* kase, lapack_int* ISAVE ); + +#define LAPACK_slacn2 LAPACK_GLOBAL(slacn2,SLACN2) +void LAPACK_slacn2( + lapack_int const* n, + float* V, + float* X, lapack_int* ISGN, + float* est, lapack_int* kase, lapack_int* ISAVE ); + +#define LAPACK_zlacn2 LAPACK_GLOBAL(zlacn2,ZLACN2) +void LAPACK_zlacn2( + lapack_int const* n, + lapack_complex_double* V, + lapack_complex_double* X, + double* est, lapack_int* kase, lapack_int* ISAVE ); + +#define LAPACK_clacp2 LAPACK_GLOBAL(clacp2,CLACP2) +void LAPACK_clacp2( + char const* uplo, + lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb ); + +#define LAPACK_zlacp2 LAPACK_GLOBAL(zlacp2,ZLACP2) +void LAPACK_zlacp2( + char const* uplo, + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + 
lapack_complex_double* B, lapack_int const* ldb ); + +#define LAPACK_clacpy LAPACK_GLOBAL(clacpy,CLACPY) +void LAPACK_clacpy( + char const* uplo, + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb ); + +#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy,DLACPY) +void LAPACK_dlacpy( + char const* uplo, + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double* B, lapack_int const* ldb ); + +#define LAPACK_slacpy LAPACK_GLOBAL(slacpy,SLACPY) +void LAPACK_slacpy( + char const* uplo, + lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float* B, lapack_int const* ldb ); + +#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy,ZLACPY) +void LAPACK_zlacpy( + char const* uplo, + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb ); + +#define LAPACK_clacrm LAPACK_GLOBAL(clacrm,CLACRM) +void LAPACK_clacrm( + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, + float* rwork ); + +#define LAPACK_zlacrm LAPACK_GLOBAL(zlacrm,ZLACRM) +void LAPACK_zlacrm( + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, + double* rwork ); + +#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c,ZLAG2C) +void LAPACK_zlag2c( + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_float* SA, lapack_int const* ldsa, + lapack_int* info ); + +#define LAPACK_slag2d LAPACK_GLOBAL(slag2d,SLAG2D) +void LAPACK_slag2d( + lapack_int const* m, lapack_int const* n, + float const* SA, lapack_int const* ldsa, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s,DLAG2S) +void LAPACK_dlag2s( + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + float* SA, lapack_int const* ldsa, + lapack_int* info ); + +#define LAPACK_clag2z LAPACK_GLOBAL(clag2z,CLAG2Z) +void LAPACK_clag2z( + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* SA, lapack_int const* ldsa, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_clagge LAPACK_GLOBAL(clagge,CLAGGE) +void LAPACK_clagge( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + float const* D, + lapack_complex_float* A, lapack_int const* lda, lapack_int* iseed, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dlagge LAPACK_GLOBAL(dlagge,DLAGGE) +void LAPACK_dlagge( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + double const* D, + double* A, lapack_int const* lda, lapack_int* iseed, + double* work, + lapack_int* info ); + +#define LAPACK_slagge LAPACK_GLOBAL(slagge,SLAGGE) +void LAPACK_slagge( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + float const* D, + float* A, lapack_int const* lda, lapack_int* iseed, + float* work, + lapack_int* info ); + +#define LAPACK_zlagge LAPACK_GLOBAL(zlagge,ZLAGGE) +void LAPACK_zlagge( + lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + double const* D, + lapack_complex_double* A, 
lapack_int const* lda, lapack_int* iseed, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_claghe LAPACK_GLOBAL(claghe,CLAGHE) +void LAPACK_claghe( + lapack_int const* n, lapack_int const* k, + float const* D, + lapack_complex_float* A, lapack_int const* lda, lapack_int* iseed, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe,ZLAGHE) +void LAPACK_zlaghe( + lapack_int const* n, lapack_int const* k, + double const* D, + lapack_complex_double* A, lapack_int const* lda, lapack_int* iseed, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_clagsy LAPACK_GLOBAL(clagsy,CLAGSY) +void LAPACK_clagsy( + lapack_int const* n, lapack_int const* k, + float const* D, + lapack_complex_float* A, lapack_int const* lda, lapack_int* iseed, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy,DLAGSY) +void LAPACK_dlagsy( + lapack_int const* n, lapack_int const* k, + double const* D, + double* A, lapack_int const* lda, lapack_int* iseed, + double* work, + lapack_int* info ); + +#define LAPACK_slagsy LAPACK_GLOBAL(slagsy,SLAGSY) +void LAPACK_slagsy( + lapack_int const* n, lapack_int const* k, + float const* D, + float* A, lapack_int const* lda, lapack_int* iseed, + float* work, + lapack_int* info ); + +#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy,ZLAGSY) +void LAPACK_zlagsy( + lapack_int const* n, lapack_int const* k, + double const* D, + lapack_complex_double* A, lapack_int const* lda, lapack_int* iseed, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_dlamch LAPACK_GLOBAL(dlamch,DLAMCH) +double LAPACK_dlamch( + char const* cmach ); + +#define LAPACK_slamch LAPACK_GLOBAL(slamch,SLAMCH) +lapack_float_return LAPACK_slamch( + char const* cmach ); + +#define LAPACK_clangb LAPACK_GLOBAL(clangb,CLANGB) +lapack_float_return LAPACK_clangb( + char const* norm, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_float const* AB, lapack_int const* ldab, + float* work ); + +#define LAPACK_dlangb LAPACK_GLOBAL(dlangb,DLANGB) +double LAPACK_dlangb( + char const* norm, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + double const* AB, lapack_int const* ldab, + double* work ); + +#define LAPACK_slangb LAPACK_GLOBAL(slangb,SLANGB) +lapack_float_return LAPACK_slangb( + char const* norm, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + float const* AB, lapack_int const* ldab, + float* work ); + +#define LAPACK_zlangb LAPACK_GLOBAL(zlangb,ZLANGB) +double LAPACK_zlangb( + char const* norm, + lapack_int const* n, lapack_int const* kl, lapack_int const* ku, + lapack_complex_double const* AB, lapack_int const* ldab, + double* work ); + +#define LAPACK_clange LAPACK_GLOBAL(clange,CLANGE) +lapack_float_return LAPACK_clange( + char const* norm, + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_dlange LAPACK_GLOBAL(dlange,DLANGE) +double LAPACK_dlange( + char const* norm, + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_slange LAPACK_GLOBAL(slange,SLANGE) +lapack_float_return LAPACK_slange( + char const* norm, + lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_zlange LAPACK_GLOBAL(zlange,ZLANGE) +double LAPACK_zlange( + char const* norm, + lapack_int const* m, lapack_int const* n, + 
lapack_complex_double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_clangt LAPACK_GLOBAL(clangt,CLANGT) +lapack_float_return LAPACK_clangt( + char const* norm, + lapack_int const* n, + lapack_complex_float const* DL, + lapack_complex_float const* D, + lapack_complex_float const* DU ); + +#define LAPACK_dlangt LAPACK_GLOBAL(dlangt,DLANGT) +double LAPACK_dlangt( + char const* norm, + lapack_int const* n, + double const* DL, + double const* D, + double const* DU ); + +#define LAPACK_slangt LAPACK_GLOBAL(slangt,SLANGT) +lapack_float_return LAPACK_slangt( + char const* norm, + lapack_int const* n, + float const* DL, + float const* D, + float const* DU ); + +#define LAPACK_zlangt LAPACK_GLOBAL(zlangt,ZLANGT) +double LAPACK_zlangt( + char const* norm, + lapack_int const* n, + lapack_complex_double const* DL, + lapack_complex_double const* D, + lapack_complex_double const* DU ); + +#define LAPACK_clanhb LAPACK_GLOBAL(clanhb,CLANHB) +lapack_float_return LAPACK_clanhb( + char const* norm, char const* uplo, + lapack_int const* n, lapack_int const* k, + lapack_complex_float const* AB, lapack_int const* ldab, + float* work ); + +#define LAPACK_zlanhb LAPACK_GLOBAL(zlanhb,ZLANHB) +double LAPACK_zlanhb( + char const* norm, char const* uplo, + lapack_int const* n, lapack_int const* k, + lapack_complex_double const* AB, lapack_int const* ldab, + double* work ); + +#define LAPACK_clanhe LAPACK_GLOBAL(clanhe,CLANHE) +lapack_float_return LAPACK_clanhe( + char const* norm, char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe,ZLANHE) +double LAPACK_zlanhe( + char const* norm, char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_clanhp LAPACK_GLOBAL(clanhp,CLANHP) +lapack_float_return LAPACK_clanhp( + char const* norm, char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, + float* work ); + +#define LAPACK_zlanhp LAPACK_GLOBAL(zlanhp,ZLANHP) +double LAPACK_zlanhp( + char const* norm, char const* uplo, + lapack_int const* n, + lapack_complex_double const* AP, + double* work ); + +#define LAPACK_clanhs LAPACK_GLOBAL(clanhs,CLANHS) +lapack_float_return LAPACK_clanhs( + char const* norm, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_dlanhs LAPACK_GLOBAL(dlanhs,DLANHS) +double LAPACK_dlanhs( + char const* norm, + lapack_int const* n, + double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_slanhs LAPACK_GLOBAL(slanhs,SLANHS) +lapack_float_return LAPACK_slanhs( + char const* norm, + lapack_int const* n, + float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_zlanhs LAPACK_GLOBAL(zlanhs,ZLANHS) +double LAPACK_zlanhs( + char const* norm, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_clanht LAPACK_GLOBAL(clanht,CLANHT) +lapack_float_return LAPACK_clanht( + char const* norm, + lapack_int const* n, + float const* D, + lapack_complex_float const* E ); + +#define LAPACK_zlanht LAPACK_GLOBAL(zlanht,ZLANHT) +double LAPACK_zlanht( + char const* norm, + lapack_int const* n, + double const* D, + lapack_complex_double const* E ); + +#define LAPACK_clansb LAPACK_GLOBAL(clansb,CLANSB) +lapack_float_return LAPACK_clansb( + char const* norm, char const* uplo, + lapack_int const* n, lapack_int const* k, + lapack_complex_float const* 
AB, lapack_int const* ldab, + float* work ); + +#define LAPACK_dlansb LAPACK_GLOBAL(dlansb,DLANSB) +double LAPACK_dlansb( + char const* norm, char const* uplo, + lapack_int const* n, lapack_int const* k, + double const* AB, lapack_int const* ldab, + double* work ); + +#define LAPACK_slansb LAPACK_GLOBAL(slansb,SLANSB) +lapack_float_return LAPACK_slansb( + char const* norm, char const* uplo, + lapack_int const* n, lapack_int const* k, + float const* AB, lapack_int const* ldab, + float* work ); + +#define LAPACK_zlansb LAPACK_GLOBAL(zlansb,ZLANSB) +double LAPACK_zlansb( + char const* norm, char const* uplo, + lapack_int const* n, lapack_int const* k, + lapack_complex_double const* AB, lapack_int const* ldab, + double* work ); + +#define LAPACK_clansp LAPACK_GLOBAL(clansp,CLANSP) +lapack_float_return LAPACK_clansp( + char const* norm, char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, + float* work ); + +#define LAPACK_dlansp LAPACK_GLOBAL(dlansp,DLANSP) +double LAPACK_dlansp( + char const* norm, char const* uplo, + lapack_int const* n, + double const* AP, + double* work ); + +#define LAPACK_slansp LAPACK_GLOBAL(slansp,SLANSP) +lapack_float_return LAPACK_slansp( + char const* norm, char const* uplo, + lapack_int const* n, + float const* AP, + float* work ); + +#define LAPACK_zlansp LAPACK_GLOBAL(zlansp,ZLANSP) +double LAPACK_zlansp( + char const* norm, char const* uplo, + lapack_int const* n, + lapack_complex_double const* AP, + double* work ); + +#define LAPACK_dlanst LAPACK_GLOBAL(dlanst,DLANST) +double LAPACK_dlanst( + char const* norm, + lapack_int const* n, + double const* D, + double const* E ); + +#define LAPACK_slanst LAPACK_GLOBAL(slanst,SLANST) +lapack_float_return LAPACK_slanst( + char const* norm, + lapack_int const* n, + float const* D, + float const* E ); + +#define LAPACK_clansy LAPACK_GLOBAL(clansy,CLANSY) +lapack_float_return LAPACK_clansy( + char const* norm, char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_dlansy LAPACK_GLOBAL(dlansy,DLANSY) +double LAPACK_dlansy( + char const* norm, char const* uplo, + lapack_int const* n, + double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_slansy LAPACK_GLOBAL(slansy,SLANSY) +lapack_float_return LAPACK_slansy( + char const* norm, char const* uplo, + lapack_int const* n, + float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_zlansy LAPACK_GLOBAL(zlansy,ZLANSY) +double LAPACK_zlansy( + char const* norm, char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_clantb LAPACK_GLOBAL(clantb,CLANTB) +lapack_float_return LAPACK_clantb( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, lapack_int const* k, + lapack_complex_float const* AB, lapack_int const* ldab, + float* work ); + +#define LAPACK_dlantb LAPACK_GLOBAL(dlantb,DLANTB) +double LAPACK_dlantb( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, lapack_int const* k, + double const* AB, lapack_int const* ldab, + double* work ); + +#define LAPACK_slantb LAPACK_GLOBAL(slantb,SLANTB) +lapack_float_return LAPACK_slantb( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, lapack_int const* k, + float const* AB, lapack_int const* ldab, + float* work ); + +#define LAPACK_zlantb LAPACK_GLOBAL(zlantb,ZLANTB) +double LAPACK_zlantb( + char const* norm, char const* uplo, char const* diag, + 
lapack_int const* n, lapack_int const* k, + lapack_complex_double const* AB, lapack_int const* ldab, + double* work ); + +#define LAPACK_clantp LAPACK_GLOBAL(clantp,CLANTP) +lapack_float_return LAPACK_clantp( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_float const* AP, + float* work ); + +#define LAPACK_dlantp LAPACK_GLOBAL(dlantp,DLANTP) +double LAPACK_dlantp( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + double const* AP, + double* work ); + +#define LAPACK_slantp LAPACK_GLOBAL(slantp,SLANTP) +lapack_float_return LAPACK_slantp( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + float const* AP, + float* work ); + +#define LAPACK_zlantp LAPACK_GLOBAL(zlantp,ZLANTP) +double LAPACK_zlantp( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_double const* AP, + double* work ); + +#define LAPACK_clantr LAPACK_GLOBAL(clantr,CLANTR) +lapack_float_return LAPACK_clantr( + char const* norm, char const* uplo, char const* diag, + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_dlantr LAPACK_GLOBAL(dlantr,DLANTR) +double LAPACK_dlantr( + char const* norm, char const* uplo, char const* diag, + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_slantr LAPACK_GLOBAL(slantr,SLANTR) +lapack_float_return LAPACK_slantr( + char const* norm, char const* uplo, char const* diag, + lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float* work ); + +#define LAPACK_zlantr LAPACK_GLOBAL(zlantr,ZLANTR) +double LAPACK_zlantr( + char const* norm, char const* uplo, char const* diag, + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* work ); + +#define LAPACK_clapmr LAPACK_GLOBAL(clapmr,CLAPMR) +void LAPACK_clapmr( + lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n, + lapack_complex_float* X, lapack_int const* ldx, lapack_int* K ); + +#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr,DLAPMR) +void LAPACK_dlapmr( + lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n, + double* X, lapack_int const* ldx, lapack_int* K ); + +#define LAPACK_slapmr LAPACK_GLOBAL(slapmr,SLAPMR) +void LAPACK_slapmr( + lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n, + float* X, lapack_int const* ldx, lapack_int* K ); + +#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr,ZLAPMR) +void LAPACK_zlapmr( + lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n, + lapack_complex_double* X, lapack_int const* ldx, lapack_int* K ); + +#define LAPACK_clapmt LAPACK_GLOBAL(clapmt,CLAPMT) +void LAPACK_clapmt( + lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n, + lapack_complex_float* X, lapack_int const* ldx, lapack_int* K ); + +#define LAPACK_dlapmt LAPACK_GLOBAL(dlapmt,DLAPMT) +void LAPACK_dlapmt( + lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n, + double* X, lapack_int const* ldx, lapack_int* K ); + +#define LAPACK_slapmt LAPACK_GLOBAL(slapmt,SLAPMT) +void LAPACK_slapmt( + lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n, + float* X, lapack_int const* ldx, lapack_int* K ); + +#define LAPACK_zlapmt LAPACK_GLOBAL(zlapmt,ZLAPMT) +void LAPACK_zlapmt( + lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n, + 
lapack_complex_double* X, lapack_int const* ldx, lapack_int* K ); + +#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2,DLAPY2) +double LAPACK_dlapy2( + double const* x, + double const* y ); + +#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2,SLAPY2) +lapack_float_return LAPACK_slapy2( + float const* x, + float const* y ); + +#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3,DLAPY3) +double LAPACK_dlapy3( + double const* x, + double const* y, + double const* z ); + +#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3,SLAPY3) +lapack_float_return LAPACK_slapy3( + float const* x, + float const* y, + float const* z ); + +#define LAPACK_clarcm LAPACK_GLOBAL(clarcm,CLARCM) +void LAPACK_clarcm( + lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, + float* rwork ); + +#define LAPACK_zlarcm LAPACK_GLOBAL(zlarcm,ZLARCM) +void LAPACK_zlarcm( + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, + double* rwork ); + +#define LAPACK_clarf LAPACK_GLOBAL(clarf,CLARF) +void LAPACK_clarf( + char const* side, + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* V, lapack_int const* incv, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work ); + +#define LAPACK_dlarf LAPACK_GLOBAL(dlarf,DLARF) +void LAPACK_dlarf( + char const* side, + lapack_int const* m, lapack_int const* n, + double const* V, lapack_int const* incv, + double const* tau, + double* C, lapack_int const* ldc, + double* work ); + +#define LAPACK_slarf LAPACK_GLOBAL(slarf,SLARF) +void LAPACK_slarf( + char const* side, + lapack_int const* m, lapack_int const* n, + float const* V, lapack_int const* incv, + float const* tau, + float* C, lapack_int const* ldc, + float* work ); + +#define LAPACK_zlarf LAPACK_GLOBAL(zlarf,ZLARF) +void LAPACK_zlarf( + char const* side, + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* V, lapack_int const* incv, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work ); + +#define LAPACK_clarfb LAPACK_GLOBAL(clarfb,CLARFB) +void LAPACK_clarfb( + char const* side, char const* trans, char const* direct, char const* storev, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float const* V, lapack_int const* ldv, + lapack_complex_float const* T, lapack_int const* ldt, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* ldwork ); + +#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb,DLARFB) +void LAPACK_dlarfb( + char const* side, char const* trans, char const* direct, char const* storev, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double const* V, lapack_int const* ldv, + double const* T, lapack_int const* ldt, + double* C, lapack_int const* ldc, + double* work, lapack_int const* ldwork ); + +#define LAPACK_slarfb LAPACK_GLOBAL(slarfb,SLARFB) +void LAPACK_slarfb( + char const* side, char const* trans, char const* direct, char const* storev, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float const* V, lapack_int const* ldv, + float const* T, lapack_int const* ldt, + float* C, lapack_int const* ldc, + float* work, lapack_int const* ldwork ); + +#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb,ZLARFB) +void 
LAPACK_zlarfb( + char const* side, char const* trans, char const* direct, char const* storev, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double const* V, lapack_int const* ldv, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* ldwork ); + +#define LAPACK_clarfg LAPACK_GLOBAL(clarfg,CLARFG) +void LAPACK_clarfg( + lapack_int const* n, + lapack_complex_float* alpha, + lapack_complex_float* X, lapack_int const* incx, + lapack_complex_float* tau ); + +#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg,DLARFG) +void LAPACK_dlarfg( + lapack_int const* n, + double* alpha, + double* X, lapack_int const* incx, + double* tau ); + +#define LAPACK_slarfg LAPACK_GLOBAL(slarfg,SLARFG) +void LAPACK_slarfg( + lapack_int const* n, + float* alpha, + float* X, lapack_int const* incx, + float* tau ); + +#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg,ZLARFG) +void LAPACK_zlarfg( + lapack_int const* n, + lapack_complex_double* alpha, + lapack_complex_double* X, lapack_int const* incx, + lapack_complex_double* tau ); + +#define LAPACK_clarft LAPACK_GLOBAL(clarft,CLARFT) +void LAPACK_clarft( + char const* direct, char const* storev, + lapack_int const* n, lapack_int const* k, + lapack_complex_float const* V, lapack_int const* ldv, + lapack_complex_float const* tau, + lapack_complex_float* T, lapack_int const* ldt ); + +#define LAPACK_dlarft LAPACK_GLOBAL(dlarft,DLARFT) +void LAPACK_dlarft( + char const* direct, char const* storev, + lapack_int const* n, lapack_int const* k, + double const* V, lapack_int const* ldv, + double const* tau, + double* T, lapack_int const* ldt ); + +#define LAPACK_slarft LAPACK_GLOBAL(slarft,SLARFT) +void LAPACK_slarft( + char const* direct, char const* storev, + lapack_int const* n, lapack_int const* k, + float const* V, lapack_int const* ldv, + float const* tau, + float* T, lapack_int const* ldt ); + +#define LAPACK_zlarft LAPACK_GLOBAL(zlarft,ZLARFT) +void LAPACK_zlarft( + char const* direct, char const* storev, + lapack_int const* n, lapack_int const* k, + lapack_complex_double const* V, lapack_int const* ldv, + lapack_complex_double const* tau, + lapack_complex_double* T, lapack_int const* ldt ); + +#define LAPACK_clarfx LAPACK_GLOBAL(clarfx,CLARFX) +void LAPACK_clarfx( + char const* side, + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* V, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work ); + +#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx,DLARFX) +void LAPACK_dlarfx( + char const* side, + lapack_int const* m, lapack_int const* n, + double const* V, + double const* tau, + double* C, lapack_int const* ldc, + double* work ); + +#define LAPACK_slarfx LAPACK_GLOBAL(slarfx,SLARFX) +void LAPACK_slarfx( + char const* side, + lapack_int const* m, lapack_int const* n, + float const* V, + float const* tau, + float* C, lapack_int const* ldc, + float* work ); + +#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx,ZLARFX) +void LAPACK_zlarfx( + char const* side, + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* V, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work ); + +#define LAPACK_clarnv LAPACK_GLOBAL(clarnv,CLARNV) +void LAPACK_clarnv( + lapack_int const* idist, lapack_int* iseed, lapack_int const* n, + lapack_complex_float* X ); + +#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv,DLARNV) +void 
LAPACK_dlarnv( + lapack_int const* idist, lapack_int* iseed, lapack_int const* n, + double* X ); + +#define LAPACK_slarnv LAPACK_GLOBAL(slarnv,SLARNV) +void LAPACK_slarnv( + lapack_int const* idist, lapack_int* iseed, lapack_int const* n, + float* X ); + +#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv,ZLARNV) +void LAPACK_zlarnv( + lapack_int const* idist, lapack_int* iseed, lapack_int const* n, + lapack_complex_double* X ); + +#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp,DLARTGP) +void LAPACK_dlartgp( + double const* f, + double const* g, + double* cs, + double* sn, + double* r ); + +#define LAPACK_slartgp LAPACK_GLOBAL(slartgp,SLARTGP) +void LAPACK_slartgp( + float const* f, + float const* g, + float* cs, + float* sn, + float* r ); + +#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs,DLARTGS) +void LAPACK_dlartgs( + double const* x, + double const* y, + double const* sigma, + double* cs, + double* sn ); + +#define LAPACK_slartgs LAPACK_GLOBAL(slartgs,SLARTGS) +void LAPACK_slartgs( + float const* x, + float const* y, + float const* sigma, + float* cs, + float* sn ); + +#define LAPACK_clascl LAPACK_GLOBAL(clascl,CLASCL) +void LAPACK_clascl( + char const* type, + lapack_int const* kl, lapack_int const* ku, + float const* cfrom, + float const* cto, lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dlascl LAPACK_GLOBAL(dlascl,DLASCL) +void LAPACK_dlascl( + char const* type, + lapack_int const* kl, lapack_int const* ku, + double const* cfrom, + double const* cto, lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_slascl LAPACK_GLOBAL(slascl,SLASCL) +void LAPACK_slascl( + char const* type, + lapack_int const* kl, lapack_int const* ku, + float const* cfrom, + float const* cto, lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_zlascl LAPACK_GLOBAL(zlascl,ZLASCL) +void LAPACK_zlascl( + char const* type, + lapack_int const* kl, lapack_int const* ku, + double const* cfrom, + double const* cto, lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_claset LAPACK_GLOBAL(claset,CLASET) +void LAPACK_claset( + char const* uplo, + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* alpha, + lapack_complex_float const* beta, + lapack_complex_float* A, lapack_int const* lda ); + +#define LAPACK_dlaset LAPACK_GLOBAL(dlaset,DLASET) +void LAPACK_dlaset( + char const* uplo, + lapack_int const* m, lapack_int const* n, + double const* alpha, + double const* beta, + double* A, lapack_int const* lda ); + +#define LAPACK_slaset LAPACK_GLOBAL(slaset,SLASET) +void LAPACK_slaset( + char const* uplo, + lapack_int const* m, lapack_int const* n, + float const* alpha, + float const* beta, + float* A, lapack_int const* lda ); + +#define LAPACK_zlaset LAPACK_GLOBAL(zlaset,ZLASET) +void LAPACK_zlaset( + char const* uplo, + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* alpha, + lapack_complex_double const* beta, + lapack_complex_double* A, lapack_int const* lda ); + +#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt,DLASRT) +void LAPACK_dlasrt( + char const* id, + lapack_int const* n, + double* D, + lapack_int* info ); + +#define LAPACK_slasrt LAPACK_GLOBAL(slasrt,SLASRT) +void LAPACK_slasrt( + char const* id, + lapack_int const* n, + float* D, + lapack_int* info ); + +#define LAPACK_classq 
LAPACK_GLOBAL(classq,CLASSQ) +void LAPACK_classq( + lapack_int const* n, + lapack_complex_float const* X, lapack_int const* incx, + float* scale, + float* sumsq ); + +#define LAPACK_dlassq LAPACK_GLOBAL(dlassq,DLASSQ) +void LAPACK_dlassq( + lapack_int const* n, + double const* X, lapack_int const* incx, + double* scale, + double* sumsq ); + +#define LAPACK_slassq LAPACK_GLOBAL(slassq,SLASSQ) +void LAPACK_slassq( + lapack_int const* n, + float const* X, lapack_int const* incx, + float* scale, + float* sumsq ); + +#define LAPACK_zlassq LAPACK_GLOBAL(zlassq,ZLASSQ) +void LAPACK_zlassq( + lapack_int const* n, + lapack_complex_double const* X, lapack_int const* incx, + double* scale, + double* sumsq ); + +#define LAPACK_claswp LAPACK_GLOBAL(claswp,CLASWP) +void LAPACK_claswp( + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx ); + +#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp,DLASWP) +void LAPACK_dlaswp( + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx ); + +#define LAPACK_slaswp LAPACK_GLOBAL(slaswp,SLASWP) +void LAPACK_slaswp( + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx ); + +#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp,ZLASWP) +void LAPACK_zlaswp( + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx ); + +#define LAPACK_clatms LAPACK_GLOBAL(clatms,CLATMS) +void LAPACK_clatms( + lapack_int const* m, lapack_int const* n, char const* dist, + lapack_int* iseed, char const* sym, + float* D, + lapack_int const* mode, + float const* cond, + float const* dmax, lapack_int const* kl, lapack_int const* ku, char const* pack, + lapack_complex_float* A, + lapack_int const* lda, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dlatms LAPACK_GLOBAL(dlatms,DLATMS) +void LAPACK_dlatms( + lapack_int const* m, lapack_int const* n, char const* dist, + lapack_int* iseed, char const* sym, + double* D, + lapack_int const* mode, + double const* cond, + double const* dmax, lapack_int const* kl, lapack_int const* ku, char const* pack, + double* A, + lapack_int const* lda, + double* work, + lapack_int* info ); + +#define LAPACK_slatms LAPACK_GLOBAL(slatms,SLATMS) +void LAPACK_slatms( + lapack_int const* m, lapack_int const* n, char const* dist, + lapack_int* iseed, char const* sym, + float* D, + lapack_int const* mode, + float const* cond, + float const* dmax, lapack_int const* kl, lapack_int const* ku, char const* pack, + float* A, + lapack_int const* lda, + float* work, + lapack_int* info ); + +#define LAPACK_zlatms LAPACK_GLOBAL(zlatms,ZLATMS) +void LAPACK_zlatms( + lapack_int const* m, lapack_int const* n, char const* dist, + lapack_int* iseed, char const* sym, + double* D, + lapack_int const* mode, + double const* cond, + double const* dmax, lapack_int const* kl, lapack_int const* ku, char const* pack, + lapack_complex_double* A, + lapack_int const* lda, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_clauum LAPACK_GLOBAL(clauum,CLAUUM) +void LAPACK_clauum( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dlauum LAPACK_GLOBAL(dlauum,DLAUUM) +void LAPACK_dlauum( + char 
const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_slauum LAPACK_GLOBAL(slauum,SLAUUM) +void LAPACK_slauum( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_zlauum LAPACK_GLOBAL(zlauum,ZLAUUM) +void LAPACK_zlauum( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_ilaver LAPACK_GLOBAL(ilaver,ILAVER) +void LAPACK_ilaver( + lapack_int* vers_major, lapack_int* vers_minor, lapack_int* vers_patch ); + +#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr,DOPGTR) +void LAPACK_dopgtr( + char const* uplo, + lapack_int const* n, + double const* AP, + double const* tau, + double* Q, lapack_int const* ldq, + double* work, + lapack_int* info ); + +#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr,SOPGTR) +void LAPACK_sopgtr( + char const* uplo, + lapack_int const* n, + float const* AP, + float const* tau, + float* Q, lapack_int const* ldq, + float* work, + lapack_int* info ); + +#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr,DOPMTR) +void LAPACK_dopmtr( + char const* side, char const* uplo, char const* trans, + lapack_int const* m, lapack_int const* n, + double const* AP, + double const* tau, + double* C, lapack_int const* ldc, + double* work, + lapack_int* info ); + +#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr,SOPMTR) +void LAPACK_sopmtr( + char const* side, char const* uplo, char const* trans, + lapack_int const* m, lapack_int const* n, + float const* AP, + float const* tau, + float* C, lapack_int const* ldc, + float* work, + lapack_int* info ); + +#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb,DORBDB) +void LAPACK_dorbdb( + char const* trans, char const* signs, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + double* X11, lapack_int const* ldx11, + double* X12, lapack_int const* ldx12, + double* X21, lapack_int const* ldx21, + double* X22, lapack_int const* ldx22, + double* theta, + double* phi, + double* TAUP1, + double* TAUP2, + double* TAUQ1, + double* TAUQ2, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb,SORBDB) +void LAPACK_sorbdb( + char const* trans, char const* signs, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + float* X11, lapack_int const* ldx11, + float* X12, lapack_int const* ldx12, + float* X21, lapack_int const* ldx21, + float* X22, lapack_int const* ldx22, + float* theta, + float* phi, + float* TAUP1, + float* TAUP2, + float* TAUQ1, + float* TAUQ2, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd,DORCSD) +void LAPACK_dorcsd( + char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, char const* signs, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + double* X11, lapack_int const* ldx11, + double* X12, lapack_int const* ldx12, + double* X21, lapack_int const* ldx21, + double* X22, lapack_int const* ldx22, + double* theta, + double* U1, lapack_int const* ldu1, + double* U2, lapack_int const* ldu2, + double* V1T, lapack_int const* ldv1t, + double* V2T, lapack_int const* ldv2t, + double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd,SORCSD) +void LAPACK_sorcsd( + char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, char const* signs, + lapack_int const* m, lapack_int const* 
p, lapack_int const* q, + float* X11, lapack_int const* ldx11, + float* X12, lapack_int const* ldx12, + float* X21, lapack_int const* ldx21, + float* X22, lapack_int const* ldx22, + float* theta, + float* U1, lapack_int const* ldu1, + float* U2, lapack_int const* ldu2, + float* V1T, lapack_int const* ldv1t, + float* V2T, lapack_int const* ldv2t, + float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_dorcsd2by1 LAPACK_GLOBAL(dorcsd2by1,DORCSD2BY1) +void LAPACK_dorcsd2by1( + char const* jobu1, char const* jobu2, char const* jobv1t, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + double* X11, lapack_int const* ldx11, + double* X21, lapack_int const* ldx21, + double* theta, + double* U1, lapack_int const* ldu1, + double* U2, lapack_int const* ldu2, + double* V1T, lapack_int const* ldv1t, + double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sorcsd2by1 LAPACK_GLOBAL(sorcsd2by1,SORCSD2BY1) +void LAPACK_sorcsd2by1( + char const* jobu1, char const* jobu2, char const* jobv1t, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + float* X11, lapack_int const* ldx11, + float* X21, lapack_int const* ldx21, + float* theta, + float* U1, lapack_int const* ldu1, + float* U2, lapack_int const* ldu2, + float* V1T, lapack_int const* ldv1t, + float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr,DORGBR) +void LAPACK_dorgbr( + char const* vect, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double* A, lapack_int const* lda, + double const* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr,SORGBR) +void LAPACK_sorgbr( + char const* vect, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float* A, lapack_int const* lda, + float const* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dorghr LAPACK_GLOBAL(dorghr,DORGHR) +void LAPACK_dorghr( + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double* A, lapack_int const* lda, + double const* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorghr LAPACK_GLOBAL(sorghr,SORGHR) +void LAPACK_sorghr( + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float* A, lapack_int const* lda, + float const* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dorglq LAPACK_GLOBAL(dorglq,DORGLQ) +void LAPACK_dorglq( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double* A, lapack_int const* lda, + double const* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorglq LAPACK_GLOBAL(sorglq,SORGLQ) +void LAPACK_sorglq( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float* A, lapack_int const* lda, + float const* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dorgql LAPACK_GLOBAL(dorgql,DORGQL) +void LAPACK_dorgql( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double* A, lapack_int const* lda, + double const* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorgql LAPACK_GLOBAL(sorgql,SORGQL) +void LAPACK_sorgql( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float* A, lapack_int const* lda, + float const* tau, + float* work, lapack_int const* lwork, + 
lapack_int* info ); + +#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr,DORGQR) +void LAPACK_dorgqr( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double* A, lapack_int const* lda, + double const* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr,SORGQR) +void LAPACK_sorgqr( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float* A, lapack_int const* lda, + float const* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq,DORGRQ) +void LAPACK_dorgrq( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double* A, lapack_int const* lda, + double const* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq,SORGRQ) +void LAPACK_sorgrq( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float* A, lapack_int const* lda, + float const* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr,DORGTR) +void LAPACK_dorgtr( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double const* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr,SORGTR) +void LAPACK_sorgtr( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float const* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR) +void LAPACK_dormbr( + char const* vect, char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double const* A, lapack_int const* lda, + double const* tau, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sormbr LAPACK_GLOBAL(sormbr,SORMBR) +void LAPACK_sormbr( + char const* vect, char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float const* A, lapack_int const* lda, + float const* tau, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dormhr LAPACK_GLOBAL(dormhr,DORMHR) +void LAPACK_dormhr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + double const* A, lapack_int const* lda, + double const* tau, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sormhr LAPACK_GLOBAL(sormhr,SORMHR) +void LAPACK_sormhr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + float const* A, lapack_int const* lda, + float const* tau, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dormlq LAPACK_GLOBAL(dormlq,DORMLQ) +void LAPACK_dormlq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double const* A, lapack_int const* lda, + double const* tau, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sormlq LAPACK_GLOBAL(sormlq,SORMLQ) +void LAPACK_sormlq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float const* A, lapack_int const* lda, + float const* tau, + 
float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dormql LAPACK_GLOBAL(dormql,DORMQL) +void LAPACK_dormql( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double const* A, lapack_int const* lda, + double const* tau, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sormql LAPACK_GLOBAL(sormql,SORMQL) +void LAPACK_sormql( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float const* A, lapack_int const* lda, + float const* tau, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dormqr LAPACK_GLOBAL(dormqr,DORMQR) +void LAPACK_dormqr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double const* A, lapack_int const* lda, + double const* tau, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sormqr LAPACK_GLOBAL(sormqr,SORMQR) +void LAPACK_sormqr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float const* A, lapack_int const* lda, + float const* tau, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dormrq LAPACK_GLOBAL(dormrq,DORMRQ) +void LAPACK_dormrq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + double const* A, lapack_int const* lda, + double const* tau, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sormrq LAPACK_GLOBAL(sormrq,SORMRQ) +void LAPACK_sormrq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + float const* A, lapack_int const* lda, + float const* tau, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dormrz LAPACK_GLOBAL(dormrz,DORMRZ) +void LAPACK_dormrz( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, + double const* A, lapack_int const* lda, + double const* tau, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sormrz LAPACK_GLOBAL(sormrz,SORMRZ) +void LAPACK_sormrz( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, + float const* A, lapack_int const* lda, + float const* tau, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dormtr LAPACK_GLOBAL(dormtr,DORMTR) +void LAPACK_dormtr( + char const* side, char const* uplo, char const* trans, + lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* tau, + double* C, lapack_int const* ldc, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sormtr LAPACK_GLOBAL(sormtr,SORMTR) +void LAPACK_sormtr( + char const* side, char const* uplo, char const* trans, + lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* tau, + float* C, lapack_int const* ldc, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cpbcon 
LAPACK_GLOBAL(cpbcon,CPBCON) +void LAPACK_cpbcon( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float const* AB, lapack_int const* ldab, + float const* anorm, + float* rcond, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon,DPBCON) +void LAPACK_dpbcon( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + double const* AB, lapack_int const* ldab, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_spbcon LAPACK_GLOBAL(spbcon,SPBCON) +void LAPACK_spbcon( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + float const* AB, lapack_int const* ldab, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon,ZPBCON) +void LAPACK_zpbcon( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double const* AB, lapack_int const* ldab, + double const* anorm, + double* rcond, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ,CPBEQU) +void LAPACK_cpbequ( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float const* AB, lapack_int const* ldab, + float* S, + float* scond, + float* amax, + lapack_int* info ); + +#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ,DPBEQU) +void LAPACK_dpbequ( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + double const* AB, lapack_int const* ldab, + double* S, + double* scond, + double* amax, + lapack_int* info ); + +#define LAPACK_spbequ LAPACK_GLOBAL(spbequ,SPBEQU) +void LAPACK_spbequ( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + float const* AB, lapack_int const* ldab, + float* S, + float* scond, + float* amax, + lapack_int* info ); + +#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ,ZPBEQU) +void LAPACK_zpbequ( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double const* AB, lapack_int const* ldab, + double* S, + double* scond, + double* amax, + lapack_int* info ); + +#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs,CPBRFS) +void LAPACK_cpbrfs( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_float const* AB, lapack_int const* ldab, + lapack_complex_float const* AFB, lapack_int const* ldafb, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs,DPBRFS) +void LAPACK_dpbrfs( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + double const* AB, lapack_int const* ldab, + double const* AFB, lapack_int const* ldafb, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs,SPBRFS) +void LAPACK_spbrfs( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + float const* AB, lapack_int const* ldab, + float const* AFB, lapack_int const* ldafb, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zpbrfs 
LAPACK_GLOBAL(zpbrfs,ZPBRFS) +void LAPACK_zpbrfs( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_double const* AB, lapack_int const* ldab, + lapack_complex_double const* AFB, lapack_int const* ldafb, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf,CPBSTF) +void LAPACK_cpbstf( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_int* info ); + +#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf,DPBSTF) +void LAPACK_dpbstf( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + lapack_int* info ); + +#define LAPACK_spbstf LAPACK_GLOBAL(spbstf,SPBSTF) +void LAPACK_spbstf( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + lapack_int* info ); + +#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf,ZPBSTF) +void LAPACK_zpbstf( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_int* info ); + +#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv,CPBSV) +void LAPACK_cpbsv( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv,DPBSV) +void LAPACK_dpbsv( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + double* AB, lapack_int const* ldab, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_spbsv LAPACK_GLOBAL(spbsv,SPBSV) +void LAPACK_spbsv( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + float* AB, lapack_int const* ldab, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv,ZPBSV) +void LAPACK_zpbsv( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx,CPBSVX) +void LAPACK_cpbsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_complex_float* AFB, lapack_int const* ldafb, char* equed, + float* S, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx,DPBSVX) +void LAPACK_dpbsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + double* AB, lapack_int const* ldab, + double* AFB, lapack_int const* ldafb, char* equed, + double* S, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx,SPBSVX) +void LAPACK_spbsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* 
nrhs, + float* AB, lapack_int const* ldab, + float* AFB, lapack_int const* ldafb, char* equed, + float* S, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx,ZPBSVX) +void LAPACK_zpbsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_complex_double* AFB, lapack_int const* ldafb, char* equed, + double* S, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf,CPBTRF) +void LAPACK_cpbtrf( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float* AB, lapack_int const* ldab, + lapack_int* info ); + +#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf,DPBTRF) +void LAPACK_dpbtrf( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + lapack_int* info ); + +#define LAPACK_spbtrf LAPACK_GLOBAL(spbtrf,SPBTRF) +void LAPACK_spbtrf( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + lapack_int* info ); + +#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf,ZPBTRF) +void LAPACK_zpbtrf( + char const* uplo, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double* AB, lapack_int const* ldab, + lapack_int* info ); + +#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs,CPBTRS) +void LAPACK_cpbtrs( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_float const* AB, lapack_int const* ldab, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs,DPBTRS) +void LAPACK_dpbtrs( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + double const* AB, lapack_int const* ldab, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs,SPBTRS) +void LAPACK_spbtrs( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + float const* AB, lapack_int const* ldab, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs,ZPBTRS) +void LAPACK_zpbtrs( + char const* uplo, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_double const* AB, lapack_int const* ldab, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf,CPFTRF) +void LAPACK_cpftrf( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, + lapack_int* info ); + +#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf,DPFTRF) +void LAPACK_dpftrf( + char const* transr, char const* uplo, + lapack_int const* n, + double* A, + lapack_int* info ); + +#define LAPACK_spftrf LAPACK_GLOBAL(spftrf,SPFTRF) +void LAPACK_spftrf( + char const* transr, char const* uplo, + lapack_int const* n, + float* A, + lapack_int* info ); + +#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf,ZPFTRF) +void LAPACK_zpftrf( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, + lapack_int* info ); + +#define LAPACK_cpftri LAPACK_GLOBAL(cpftri,CPFTRI) +void 
LAPACK_cpftri( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_float* A, + lapack_int* info ); + +#define LAPACK_dpftri LAPACK_GLOBAL(dpftri,DPFTRI) +void LAPACK_dpftri( + char const* transr, char const* uplo, + lapack_int const* n, + double* A, + lapack_int* info ); + +#define LAPACK_spftri LAPACK_GLOBAL(spftri,SPFTRI) +void LAPACK_spftri( + char const* transr, char const* uplo, + lapack_int const* n, + float* A, + lapack_int* info ); + +#define LAPACK_zpftri LAPACK_GLOBAL(zpftri,ZPFTRI) +void LAPACK_zpftri( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_double* A, + lapack_int* info ); + +#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs,CPFTRS) +void LAPACK_cpftrs( + char const* transr, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs,DPFTRS) +void LAPACK_dpftrs( + char const* transr, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_spftrs LAPACK_GLOBAL(spftrs,SPFTRS) +void LAPACK_spftrs( + char const* transr, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs,ZPFTRS) +void LAPACK_zpftrs( + char const* transr, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cpocon LAPACK_GLOBAL(cpocon,CPOCON) +void LAPACK_cpocon( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float const* anorm, + float* rcond, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dpocon LAPACK_GLOBAL(dpocon,DPOCON) +void LAPACK_dpocon( + char const* uplo, + lapack_int const* n, + double const* A, lapack_int const* lda, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_spocon LAPACK_GLOBAL(spocon,SPOCON) +void LAPACK_spocon( + char const* uplo, + lapack_int const* n, + float const* A, lapack_int const* lda, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zpocon LAPACK_GLOBAL(zpocon,ZPOCON) +void LAPACK_zpocon( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double const* anorm, + double* rcond, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ,CPOEQU) +void LAPACK_cpoequ( + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* S, + float* scond, + float* amax, + lapack_int* info ); + +#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ,DPOEQU) +void LAPACK_dpoequ( + lapack_int const* n, + double const* A, lapack_int const* lda, + double* S, + double* scond, + double* amax, + lapack_int* info ); + +#define LAPACK_spoequ LAPACK_GLOBAL(spoequ,SPOEQU) +void LAPACK_spoequ( + lapack_int const* n, + float const* A, lapack_int const* lda, + float* S, + float* scond, + float* amax, + lapack_int* info ); + +#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ,ZPOEQU) +void LAPACK_zpoequ( + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* S, + 
double* scond, + double* amax, + lapack_int* info ); + +#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb,CPOEQUB) +void LAPACK_cpoequb( + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* S, + float* scond, + float* amax, + lapack_int* info ); + +#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb,DPOEQUB) +void LAPACK_dpoequb( + lapack_int const* n, + double const* A, lapack_int const* lda, + double* S, + double* scond, + double* amax, + lapack_int* info ); + +#define LAPACK_spoequb LAPACK_GLOBAL(spoequb,SPOEQUB) +void LAPACK_spoequb( + lapack_int const* n, + float const* A, lapack_int const* lda, + float* S, + float* scond, + float* amax, + lapack_int* info ); + +#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb,ZPOEQUB) +void LAPACK_zpoequb( + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* S, + double* scond, + double* amax, + lapack_int* info ); + +#define LAPACK_cporfs LAPACK_GLOBAL(cporfs,CPORFS) +void LAPACK_cporfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* AF, lapack_int const* ldaf, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dporfs LAPACK_GLOBAL(dporfs,DPORFS) +void LAPACK_dporfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double const* AF, lapack_int const* ldaf, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sporfs LAPACK_GLOBAL(sporfs,SPORFS) +void LAPACK_sporfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float const* AF, lapack_int const* ldaf, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zporfs LAPACK_GLOBAL(zporfs,ZPORFS) +void LAPACK_zporfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* AF, lapack_int const* ldaf, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx,CPORFSX) +void LAPACK_cporfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* AF, lapack_int const* ldaf, + float* S, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx,DPORFSX) +void LAPACK_dporfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double const* AF, lapack_int const* ldaf, + double* S, + double const* B, lapack_int 
const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx,SPORFSX) +void LAPACK_sporfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float const* AF, lapack_int const* ldaf, + float* S, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx,ZPORFSX) +void LAPACK_zporfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* AF, lapack_int const* ldaf, + double* S, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cposv LAPACK_GLOBAL(cposv,CPOSV) +void LAPACK_cposv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dposv LAPACK_GLOBAL(dposv,DPOSV) +void LAPACK_dposv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sposv LAPACK_GLOBAL(sposv,SPOSV) +void LAPACK_sposv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zposv LAPACK_GLOBAL(zposv,ZPOSV) +void LAPACK_zposv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dsposv LAPACK_GLOBAL(dsposv,DSPOSV) +void LAPACK_dsposv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* work, + float* swork, lapack_int* iter, + lapack_int* info ); + +#define LAPACK_zcposv LAPACK_GLOBAL(zcposv,ZCPOSV) +void LAPACK_zcposv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + lapack_complex_double* work, + lapack_complex_float* swork, + double* rwork, lapack_int* iter, + lapack_int* info ); + +#define LAPACK_cposvx LAPACK_GLOBAL(cposvx,CPOSVX) +void LAPACK_cposvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* AF, lapack_int const* ldaf, char* equed, + float* S, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int 
const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dposvx LAPACK_GLOBAL(dposvx,DPOSVX) +void LAPACK_dposvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* AF, lapack_int const* ldaf, char* equed, + double* S, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sposvx LAPACK_GLOBAL(sposvx,SPOSVX) +void LAPACK_sposvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* AF, lapack_int const* ldaf, char* equed, + float* S, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zposvx LAPACK_GLOBAL(zposvx,ZPOSVX) +void LAPACK_zposvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* AF, lapack_int const* ldaf, char* equed, + double* S, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx,CPOSVXX) +void LAPACK_cposvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* AF, lapack_int const* ldaf, char* equed, + float* S, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx,DPOSVXX) +void LAPACK_dposvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* AF, lapack_int const* ldaf, char* equed, + double* S, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx,SPOSVXX) +void LAPACK_sposvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* AF, lapack_int const* ldaf, char* equed, + float* S, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx,ZPOSVXX) +void LAPACK_zposvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* AF, lapack_int const* ldaf, char* 
equed, + double* S, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cpotf2 LAPACK_GLOBAL(cpotf2,CPOTF2) +void LAPACK_cpotf2( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dpotf2 LAPACK_GLOBAL(dpotf2,DPOTF2) +void LAPACK_dpotf2( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_spotf2 LAPACK_GLOBAL(spotf2,SPOTF2) +void LAPACK_spotf2( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_zpotf2 LAPACK_GLOBAL(zpotf2,ZPOTF2) +void LAPACK_zpotf2( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf,CPOTRF) +void LAPACK_cpotrf( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf,DPOTRF) +void LAPACK_dpotrf( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_spotrf LAPACK_GLOBAL(spotrf,SPOTRF) +void LAPACK_spotrf( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf,ZPOTRF) +void LAPACK_zpotrf( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_cpotrf2 LAPACK_GLOBAL(cpotrf2,CPOTRF2) +void LAPACK_cpotrf2( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dpotrf2 LAPACK_GLOBAL(dpotrf2,DPOTRF2) +void LAPACK_dpotrf2( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_spotrf2 LAPACK_GLOBAL(spotrf2,SPOTRF2) +void LAPACK_spotrf2( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_zpotrf2 LAPACK_GLOBAL(zpotrf2,ZPOTRF2) +void LAPACK_zpotrf2( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_cpotri LAPACK_GLOBAL(cpotri,CPOTRI) +void LAPACK_cpotri( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dpotri LAPACK_GLOBAL(dpotri,DPOTRI) +void LAPACK_dpotri( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_spotri LAPACK_GLOBAL(spotri,SPOTRI) +void LAPACK_spotri( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_zpotri LAPACK_GLOBAL(zpotri,ZPOTRI) +void LAPACK_zpotri( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs,CPOTRS) +void LAPACK_cpotrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int 
const* ldb, + lapack_int* info ); + +#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs,DPOTRS) +void LAPACK_dpotrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_spotrs LAPACK_GLOBAL(spotrs,SPOTRS) +void LAPACK_spotrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs,ZPOTRS) +void LAPACK_zpotrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cppcon LAPACK_GLOBAL(cppcon,CPPCON) +void LAPACK_cppcon( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, + float const* anorm, + float* rcond, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dppcon LAPACK_GLOBAL(dppcon,DPPCON) +void LAPACK_dppcon( + char const* uplo, + lapack_int const* n, + double const* AP, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sppcon LAPACK_GLOBAL(sppcon,SPPCON) +void LAPACK_sppcon( + char const* uplo, + lapack_int const* n, + float const* AP, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zppcon LAPACK_GLOBAL(zppcon,ZPPCON) +void LAPACK_zppcon( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* AP, + double const* anorm, + double* rcond, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cppequ LAPACK_GLOBAL(cppequ,CPPEQU) +void LAPACK_cppequ( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, + float* S, + float* scond, + float* amax, + lapack_int* info ); + +#define LAPACK_dppequ LAPACK_GLOBAL(dppequ,DPPEQU) +void LAPACK_dppequ( + char const* uplo, + lapack_int const* n, + double const* AP, + double* S, + double* scond, + double* amax, + lapack_int* info ); + +#define LAPACK_sppequ LAPACK_GLOBAL(sppequ,SPPEQU) +void LAPACK_sppequ( + char const* uplo, + lapack_int const* n, + float const* AP, + float* S, + float* scond, + float* amax, + lapack_int* info ); + +#define LAPACK_zppequ LAPACK_GLOBAL(zppequ,ZPPEQU) +void LAPACK_zppequ( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* AP, + double* S, + double* scond, + double* amax, + lapack_int* info ); + +#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs,CPPRFS) +void LAPACK_cpprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, + lapack_complex_float const* AFP, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs,DPPRFS) +void LAPACK_dpprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* AP, + double const* AFP, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_spprfs LAPACK_GLOBAL(spprfs,SPPRFS) +void LAPACK_spprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* AP, + 
float const* AFP, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs,ZPPRFS) +void LAPACK_zpprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, + lapack_complex_double const* AFP, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cppsv LAPACK_GLOBAL(cppsv,CPPSV) +void LAPACK_cppsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* AP, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dppsv LAPACK_GLOBAL(dppsv,DPPSV) +void LAPACK_dppsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* AP, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sppsv LAPACK_GLOBAL(sppsv,SPPSV) +void LAPACK_sppsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* AP, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zppsv LAPACK_GLOBAL(zppsv,ZPPSV) +void LAPACK_zppsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* AP, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx,CPPSVX) +void LAPACK_cppsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* AP, + lapack_complex_float* AFP, char* equed, + float* S, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx,DPPSVX) +void LAPACK_dppsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* AP, + double* AFP, char* equed, + double* S, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx,SPPSVX) +void LAPACK_sppsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* AP, + float* AFP, char* equed, + float* S, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx,ZPPSVX) +void LAPACK_zppsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* AP, + lapack_complex_double* AFP, char* equed, + double* S, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf,CPPTRF) +void LAPACK_cpptrf( + char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + lapack_int* info ); + +#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf,DPPTRF) +void LAPACK_dpptrf( + char const* uplo, + lapack_int const* n, + double* AP, + lapack_int* info ); + +#define 
LAPACK_spptrf LAPACK_GLOBAL(spptrf,SPPTRF) +void LAPACK_spptrf( + char const* uplo, + lapack_int const* n, + float* AP, + lapack_int* info ); + +#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf,ZPPTRF) +void LAPACK_zpptrf( + char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + lapack_int* info ); + +#define LAPACK_cpptri LAPACK_GLOBAL(cpptri,CPPTRI) +void LAPACK_cpptri( + char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, + lapack_int* info ); + +#define LAPACK_dpptri LAPACK_GLOBAL(dpptri,DPPTRI) +void LAPACK_dpptri( + char const* uplo, + lapack_int const* n, + double* AP, + lapack_int* info ); + +#define LAPACK_spptri LAPACK_GLOBAL(spptri,SPPTRI) +void LAPACK_spptri( + char const* uplo, + lapack_int const* n, + float* AP, + lapack_int* info ); + +#define LAPACK_zpptri LAPACK_GLOBAL(zpptri,ZPPTRI) +void LAPACK_zpptri( + char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, + lapack_int* info ); + +#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs,CPPTRS) +void LAPACK_cpptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs,DPPTRS) +void LAPACK_dpptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* AP, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_spptrs LAPACK_GLOBAL(spptrs,SPPTRS) +void LAPACK_spptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* AP, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs,ZPPTRS) +void LAPACK_zpptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf,CPSTRF) +void LAPACK_cpstrf( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* piv, lapack_int* rank, + float const* tol, + float* work, + lapack_int* info ); + +#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf,DPSTRF) +void LAPACK_dpstrf( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* piv, lapack_int* rank, + double const* tol, + double* work, + lapack_int* info ); + +#define LAPACK_spstrf LAPACK_GLOBAL(spstrf,SPSTRF) +void LAPACK_spstrf( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* piv, lapack_int* rank, + float const* tol, + float* work, + lapack_int* info ); + +#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf,ZPSTRF) +void LAPACK_zpstrf( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* piv, lapack_int* rank, + double const* tol, + double* work, + lapack_int* info ); + +#define LAPACK_cptcon LAPACK_GLOBAL(cptcon,CPTCON) +void LAPACK_cptcon( + lapack_int const* n, + float const* D, + lapack_complex_float const* E, + float const* anorm, + float* rcond, + float* rwork, + lapack_int* info ); + +#define LAPACK_dptcon LAPACK_GLOBAL(dptcon,DPTCON) +void LAPACK_dptcon( + lapack_int const* n, + double const* D, + double const* E, + double const* anorm, + double* rcond, + double* work, + lapack_int* info ); + +#define LAPACK_sptcon LAPACK_GLOBAL(sptcon,SPTCON) +void LAPACK_sptcon( + lapack_int const* n, + float const* D, + float const* E, + float const* anorm, + float* rcond, + float* work, + lapack_int* info ); 
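
All of these prototypes follow the Fortran calling convention: every argument, scalars included, is passed by pointer, character flags such as uplo are single-character strings, matrices are column-major (or packed, for the pp* routines), and info carries the usual LAPACK status code back. A minimal sketch of driving the packed-Cholesky pair declared above, assuming lapack_int is a plain int and that LAPACK_GLOBAL expands to the usual trailing-underscore symbols (dpptrf_, dpptrs_); in real code you would include this header and call LAPACK_dpptrf/LAPACK_dpptrs directly:

#include <stdio.h>

/* Redeclared locally so the sketch stands alone; these match the
   LAPACK_dpptrf/LAPACK_dpptrs prototypes above with lapack_int == int. */
void dpptrf_(char const* uplo, int const* n, double* AP, int* info);
void dpptrs_(char const* uplo, int const* n, int const* nrhs,
             double const* AP, double* B, int const* ldb, int* info);

int main(void) {
    /* 2x2 SPD matrix [[4,2],[2,3]], upper triangle packed by columns:
       AP = { a11, a12, a22 }. */
    double ap[3] = { 4.0, 2.0, 3.0 };
    double b[2]  = { 2.0, 5.0 };   /* right-hand side, overwritten with x */
    int n = 2, nrhs = 1, ldb = 2, info;

    dpptrf_("U", &n, ap, &info);   /* Cholesky factorization A = U**T * U */
    if (info == 0)
        dpptrs_("U", &n, &nrhs, ap, b, &ldb, &info);  /* solve A*x = b */
    if (info == 0)
        printf("x = (%g, %g)\n", b[0], b[1]);   /* expect (-0.5, 2) */
    return info;
}

The same pattern carries over to the full-storage po* and tridiagonal pt* families in this hunk; only the representation of A changes (full storage with lda, packed AP, or the D/E diagonal pair).
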
+ +#define LAPACK_zptcon LAPACK_GLOBAL(zptcon,ZPTCON) +void LAPACK_zptcon( + lapack_int const* n, + double const* D, + lapack_complex_double const* E, + double const* anorm, + double* rcond, + double* rwork, + lapack_int* info ); + +#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr,CPTEQR) +void LAPACK_cpteqr( + char const* compz, + lapack_int const* n, + float* D, + float* E, + lapack_complex_float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr,DPTEQR) +void LAPACK_dpteqr( + char const* compz, + lapack_int const* n, + double* D, + double* E, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define LAPACK_spteqr LAPACK_GLOBAL(spteqr,SPTEQR) +void LAPACK_spteqr( + char const* compz, + lapack_int const* n, + float* D, + float* E, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr,ZPTEQR) +void LAPACK_zpteqr( + char const* compz, + lapack_int const* n, + double* D, + double* E, + lapack_complex_double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs,CPTRFS) +void LAPACK_cptrfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* D, + lapack_complex_float const* E, + float const* DF, + lapack_complex_float const* EF, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs,DPTRFS) +void LAPACK_dptrfs( + lapack_int const* n, lapack_int const* nrhs, + double const* D, + double const* E, + double const* DF, + double const* EF, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* info ); + +#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs,SPTRFS) +void LAPACK_sptrfs( + lapack_int const* n, lapack_int const* nrhs, + float const* D, + float const* E, + float const* DF, + float const* EF, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* info ); + +#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs,ZPTRFS) +void LAPACK_zptrfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* D, + lapack_complex_double const* E, + double const* DF, + lapack_complex_double const* EF, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cptsv LAPACK_GLOBAL(cptsv,CPTSV) +void LAPACK_cptsv( + lapack_int const* n, lapack_int const* nrhs, + float* D, + lapack_complex_float* E, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dptsv LAPACK_GLOBAL(dptsv,DPTSV) +void LAPACK_dptsv( + lapack_int const* n, lapack_int const* nrhs, + double* D, + double* E, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sptsv LAPACK_GLOBAL(sptsv,SPTSV) +void LAPACK_sptsv( + lapack_int const* n, lapack_int const* nrhs, + float* D, + float* E, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zptsv LAPACK_GLOBAL(zptsv,ZPTSV) +void LAPACK_zptsv( + lapack_int const* n, lapack_int const* nrhs, + double* D, + lapack_complex_double* E, + lapack_complex_double* B, lapack_int 
const* ldb, + lapack_int* info ); + +#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx,CPTSVX) +void LAPACK_cptsvx( + char const* fact, + lapack_int const* n, lapack_int const* nrhs, + float const* D, + lapack_complex_float const* E, + float* DF, + lapack_complex_float* EF, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx,DPTSVX) +void LAPACK_dptsvx( + char const* fact, + lapack_int const* n, lapack_int const* nrhs, + double const* D, + double const* E, + double* DF, + double* EF, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + double* work, + lapack_int* info ); + +#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx,SPTSVX) +void LAPACK_sptsvx( + char const* fact, + lapack_int const* n, lapack_int const* nrhs, + float const* D, + float const* E, + float* DF, + float* EF, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, + lapack_int* info ); + +#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx,ZPTSVX) +void LAPACK_zptsvx( + char const* fact, + lapack_int const* n, lapack_int const* nrhs, + double const* D, + lapack_complex_double const* E, + double* DF, + lapack_complex_double* EF, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf,CPTTRF) +void LAPACK_cpttrf( + lapack_int const* n, + float* D, + lapack_complex_float* E, + lapack_int* info ); + +#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf,DPTTRF) +void LAPACK_dpttrf( + lapack_int const* n, + double* D, + double* E, + lapack_int* info ); + +#define LAPACK_spttrf LAPACK_GLOBAL(spttrf,SPTTRF) +void LAPACK_spttrf( + lapack_int const* n, + float* D, + float* E, + lapack_int* info ); + +#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf,ZPTTRF) +void LAPACK_zpttrf( + lapack_int const* n, + double* D, + lapack_complex_double* E, + lapack_int* info ); + +#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs,CPTTRS) +void LAPACK_cpttrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* D, + lapack_complex_float const* E, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs,DPTTRS) +void LAPACK_dpttrs( + lapack_int const* n, lapack_int const* nrhs, + double const* D, + double const* E, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_spttrs LAPACK_GLOBAL(spttrs,SPTTRS) +void LAPACK_spttrs( + lapack_int const* n, lapack_int const* nrhs, + float const* D, + float const* E, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs,ZPTTRS) +void LAPACK_zpttrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* D, + lapack_complex_double const* E, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dsbev LAPACK_GLOBAL(dsbev,DSBEV) +void LAPACK_dsbev( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + double* W, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define 
LAPACK_ssbev LAPACK_GLOBAL(ssbev,SSBEV) +void LAPACK_ssbev( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_dsbev_2stage LAPACK_GLOBAL(dsbev_2stage,DSBEV_2STAGE) +void LAPACK_dsbev_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssbev_2stage LAPACK_GLOBAL(ssbev_2stage,SSBEV_2STAGE) +void LAPACK_ssbev_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd,DSBEVD) +void LAPACK_dsbevd( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd,SSBEVD) +void LAPACK_ssbevd( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dsbevd_2stage LAPACK_GLOBAL(dsbevd_2stage,DSBEVD_2STAGE) +void LAPACK_dsbevd_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ssbevd_2stage LAPACK_GLOBAL(ssbevd_2stage,SSBEVD_2STAGE) +void LAPACK_ssbevd_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx,DSBEVX) +void LAPACK_dsbevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + double* Q, lapack_int const* ldq, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx,SSBEVX) +void LAPACK_ssbevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + float* Q, lapack_int const* ldq, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_dsbevx_2stage LAPACK_GLOBAL(dsbevx_2stage,DSBEVX_2STAGE) +void LAPACK_dsbevx_2stage( + char const* jobz, char const* range, char const* uplo, + 
lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + double* Q, lapack_int const* ldq, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_ssbevx_2stage LAPACK_GLOBAL(ssbevx_2stage,SSBEVX_2STAGE) +void LAPACK_ssbevx_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + float* Q, lapack_int const* ldq, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst,DSBGST) +void LAPACK_dsbgst( + char const* vect, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + double* AB, lapack_int const* ldab, + double const* BB, lapack_int const* ldbb, + double* X, lapack_int const* ldx, + double* work, + lapack_int* info ); + +#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst,SSBGST) +void LAPACK_ssbgst( + char const* vect, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + float* AB, lapack_int const* ldab, + float const* BB, lapack_int const* ldbb, + float* X, lapack_int const* ldx, + float* work, + lapack_int* info ); + +#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv,DSBGV) +void LAPACK_dsbgv( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + double* AB, lapack_int const* ldab, + double* BB, lapack_int const* ldbb, + double* W, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv,SSBGV) +void LAPACK_ssbgv( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + float* AB, lapack_int const* ldab, + float* BB, lapack_int const* ldbb, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd,DSBGVD) +void LAPACK_dsbgvd( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + double* AB, lapack_int const* ldab, + double* BB, lapack_int const* ldbb, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd,SSBGVD) +void LAPACK_ssbgvd( + char const* jobz, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + float* AB, lapack_int const* ldab, + float* BB, lapack_int const* ldbb, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx,DSBGVX) +void LAPACK_dsbgvx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + double* AB, lapack_int const* ldab, + double* BB, lapack_int const* ldbb, + double* Q, lapack_int const* ldq, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + 
double* Z, lapack_int const* ldz, + double* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx,SSBGVX) +void LAPACK_ssbgvx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, lapack_int const* ka, lapack_int const* kb, + float* AB, lapack_int const* ldab, + float* BB, lapack_int const* ldbb, + float* Q, lapack_int const* ldq, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd,DSBTRD) +void LAPACK_dsbtrd( + char const* vect, char const* uplo, + lapack_int const* n, lapack_int const* kd, + double* AB, lapack_int const* ldab, + double* D, + double* E, + double* Q, lapack_int const* ldq, + double* work, + lapack_int* info ); + +#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd,SSBTRD) +void LAPACK_ssbtrd( + char const* vect, char const* uplo, + lapack_int const* n, lapack_int const* kd, + float* AB, lapack_int const* ldab, + float* D, + float* E, + float* Q, lapack_int const* ldq, + float* work, + lapack_int* info ); + +#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk,DSFRK) +void LAPACK_dsfrk( + char const* transr, char const* uplo, char const* trans, + lapack_int const* n, lapack_int const* k, + double const* alpha, + double const* A, lapack_int const* lda, + double const* beta, + double* C ); + +#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk,SSFRK) +void LAPACK_ssfrk( + char const* transr, char const* uplo, char const* trans, + lapack_int const* n, lapack_int const* k, + float const* alpha, + float const* A, lapack_int const* lda, + float const* beta, + float* C ); + +#define LAPACK_cspcon LAPACK_GLOBAL(cspcon,CSPCON) +void LAPACK_cspcon( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, lapack_int const* ipiv, + float const* anorm, + float* rcond, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dspcon LAPACK_GLOBAL(dspcon,DSPCON) +void LAPACK_dspcon( + char const* uplo, + lapack_int const* n, + double const* AP, lapack_int const* ipiv, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sspcon LAPACK_GLOBAL(sspcon,SSPCON) +void LAPACK_sspcon( + char const* uplo, + lapack_int const* n, + float const* AP, lapack_int const* ipiv, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zspcon LAPACK_GLOBAL(zspcon,ZSPCON) +void LAPACK_zspcon( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* AP, lapack_int const* ipiv, + double const* anorm, + double* rcond, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_dspev LAPACK_GLOBAL(dspev,DSPEV) +void LAPACK_dspev( + char const* jobz, char const* uplo, + lapack_int const* n, + double* AP, + double* W, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define LAPACK_sspev LAPACK_GLOBAL(sspev,SSPEV) +void LAPACK_sspev( + char const* jobz, char const* uplo, + lapack_int const* n, + float* AP, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_dspevd LAPACK_GLOBAL(dspevd,DSPEVD) +void LAPACK_dspevd( + char const* jobz, char const* uplo, + lapack_int const* n, + double* AP, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, 
+ lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sspevd LAPACK_GLOBAL(sspevd,SSPEVD) +void LAPACK_sspevd( + char const* jobz, char const* uplo, + lapack_int const* n, + float* AP, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dspevx LAPACK_GLOBAL(dspevx,DSPEVX) +void LAPACK_dspevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + double* AP, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_sspevx LAPACK_GLOBAL(sspevx,SSPEVX) +void LAPACK_sspevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + float* AP, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_dspgst LAPACK_GLOBAL(dspgst,DSPGST) +void LAPACK_dspgst( + lapack_int const* itype, char const* uplo, + lapack_int const* n, + double* AP, + double const* BP, + lapack_int* info ); + +#define LAPACK_sspgst LAPACK_GLOBAL(sspgst,SSPGST) +void LAPACK_sspgst( + lapack_int const* itype, char const* uplo, + lapack_int const* n, + float* AP, + float const* BP, + lapack_int* info ); + +#define LAPACK_dspgv LAPACK_GLOBAL(dspgv,DSPGV) +void LAPACK_dspgv( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + double* AP, + double* BP, + double* W, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define LAPACK_sspgv LAPACK_GLOBAL(sspgv,SSPGV) +void LAPACK_sspgv( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + float* AP, + float* BP, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd,DSPGVD) +void LAPACK_dspgvd( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + double* AP, + double* BP, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd,SSPGVD) +void LAPACK_sspgvd( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + float* AP, + float* BP, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx,DSPGVX) +void LAPACK_dspgvx( + lapack_int const* itype, char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + double* AP, + double* BP, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx,SSPGVX) +void LAPACK_sspgvx( + lapack_int const* itype, char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + float* AP, + float* BP, + float const* vl, + float const* vu, lapack_int const* il, 
lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_csprfs LAPACK_GLOBAL(csprfs,CSPRFS) +void LAPACK_csprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, + lapack_complex_float const* AFP, lapack_int const* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs,DSPRFS) +void LAPACK_dsprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* AP, + double const* AFP, lapack_int const* ipiv, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs,SSPRFS) +void LAPACK_ssprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* AP, + float const* AFP, lapack_int const* ipiv, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs,ZSPRFS) +void LAPACK_zsprfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, + lapack_complex_double const* AFP, lapack_int const* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_cspsv LAPACK_GLOBAL(cspsv,CSPSV) +void LAPACK_cspsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* AP, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dspsv LAPACK_GLOBAL(dspsv,DSPSV) +void LAPACK_dspsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* AP, lapack_int* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_sspsv LAPACK_GLOBAL(sspsv,SSPSV) +void LAPACK_sspsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* AP, lapack_int* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zspsv LAPACK_GLOBAL(zspsv,ZSPSV) +void LAPACK_zspsv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* AP, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx,CSPSVX) +void LAPACK_cspsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, + lapack_complex_float* AFP, lapack_int* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx,DSPSVX) +void LAPACK_dspsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* AP, + double* AFP, lapack_int* ipiv, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + 
double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx,SSPSVX) +void LAPACK_sspsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* AP, + float* AFP, lapack_int* ipiv, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx,ZSPSVX) +void LAPACK_zspsvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, + lapack_complex_double* AFP, lapack_int* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd,DSPTRD) +void LAPACK_dsptrd( + char const* uplo, + lapack_int const* n, + double* AP, + double* D, + double* E, + double* tau, + lapack_int* info ); + +#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd,SSPTRD) +void LAPACK_ssptrd( + char const* uplo, + lapack_int const* n, + float* AP, + float* D, + float* E, + float* tau, + lapack_int* info ); + +#define LAPACK_csptrf LAPACK_GLOBAL(csptrf,CSPTRF) +void LAPACK_csptrf( + char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf,DSPTRF) +void LAPACK_dsptrf( + char const* uplo, + lapack_int const* n, + double* AP, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf,SSPTRF) +void LAPACK_ssptrf( + char const* uplo, + lapack_int const* n, + float* AP, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf,ZSPTRF) +void LAPACK_zsptrf( + char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, lapack_int* ipiv, + lapack_int* info ); + +#define LAPACK_csptri LAPACK_GLOBAL(csptri,CSPTRI) +void LAPACK_csptri( + char const* uplo, + lapack_int const* n, + lapack_complex_float* AP, lapack_int const* ipiv, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dsptri LAPACK_GLOBAL(dsptri,DSPTRI) +void LAPACK_dsptri( + char const* uplo, + lapack_int const* n, + double* AP, lapack_int const* ipiv, + double* work, + lapack_int* info ); + +#define LAPACK_ssptri LAPACK_GLOBAL(ssptri,SSPTRI) +void LAPACK_ssptri( + char const* uplo, + lapack_int const* n, + float* AP, lapack_int const* ipiv, + float* work, + lapack_int* info ); + +#define LAPACK_zsptri LAPACK_GLOBAL(zsptri,ZSPTRI) +void LAPACK_zsptri( + char const* uplo, + lapack_int const* n, + lapack_complex_double* AP, lapack_int const* ipiv, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_csptrs LAPACK_GLOBAL(csptrs,CSPTRS) +void LAPACK_csptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs,DSPTRS) +void LAPACK_dsptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* AP, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs,SSPTRS) +void LAPACK_ssptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float 
const* AP, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs,ZSPTRS) +void LAPACK_zsptrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dstebz LAPACK_GLOBAL(dstebz,DSTEBZ) +void LAPACK_dstebz( + char const* range, char const* order, + lapack_int const* n, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, + double const* D, + double const* E, lapack_int* m, lapack_int* nsplit, + double* W, lapack_int* IBLOCK, lapack_int* ISPLIT, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_sstebz LAPACK_GLOBAL(sstebz,SSTEBZ) +void LAPACK_sstebz( + char const* range, char const* order, + lapack_int const* n, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, + float const* D, + float const* E, lapack_int* m, lapack_int* nsplit, + float* W, lapack_int* IBLOCK, lapack_int* ISPLIT, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_cstedc LAPACK_GLOBAL(cstedc,CSTEDC) +void LAPACK_cstedc( + char const* compz, + lapack_int const* n, + float* D, + float* E, + lapack_complex_float* Z, lapack_int const* ldz, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dstedc LAPACK_GLOBAL(dstedc,DSTEDC) +void LAPACK_dstedc( + char const* compz, + lapack_int const* n, + double* D, + double* E, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sstedc LAPACK_GLOBAL(sstedc,SSTEDC) +void LAPACK_sstedc( + char const* compz, + lapack_int const* n, + float* D, + float* E, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zstedc LAPACK_GLOBAL(zstedc,ZSTEDC) +void LAPACK_zstedc( + char const* compz, + lapack_int const* n, + double* D, + double* E, + lapack_complex_double* Z, lapack_int const* ldz, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_cstegr LAPACK_GLOBAL(cstegr,CSTEGR) +void LAPACK_cstegr( + char const* jobz, char const* range, + lapack_int const* n, + float* D, + float* E, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dstegr LAPACK_GLOBAL(dstegr,DSTEGR) +void LAPACK_dstegr( + char const* jobz, char const* range, + lapack_int const* n, + double* D, + double* E, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sstegr LAPACK_GLOBAL(sstegr,SSTEGR) +void LAPACK_sstegr( + char const* jobz, 
char const* range, + lapack_int const* n, + float* D, + float* E, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zstegr LAPACK_GLOBAL(zstegr,ZSTEGR) +void LAPACK_zstegr( + char const* jobz, char const* range, + lapack_int const* n, + double* D, + double* E, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_cstein LAPACK_GLOBAL(cstein,CSTEIN) +void LAPACK_cstein( + lapack_int const* n, + float const* D, + float const* E, lapack_int const* m, + float const* W, lapack_int const* IBLOCK, lapack_int const* ISPLIT, + lapack_complex_float* Z, lapack_int const* ldz, + float* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_dstein LAPACK_GLOBAL(dstein,DSTEIN) +void LAPACK_dstein( + lapack_int const* n, + double const* D, + double const* E, lapack_int const* m, + double const* W, lapack_int const* IBLOCK, lapack_int const* ISPLIT, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_sstein LAPACK_GLOBAL(sstein,SSTEIN) +void LAPACK_sstein( + lapack_int const* n, + float const* D, + float const* E, lapack_int const* m, + float const* W, lapack_int const* IBLOCK, lapack_int const* ISPLIT, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_zstein LAPACK_GLOBAL(zstein,ZSTEIN) +void LAPACK_zstein( + lapack_int const* n, + double const* D, + double const* E, lapack_int const* m, + double const* W, lapack_int const* IBLOCK, lapack_int const* ISPLIT, + lapack_complex_double* Z, lapack_int const* ldz, + double* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_cstemr LAPACK_GLOBAL(cstemr,CSTEMR) +void LAPACK_cstemr( + char const* jobz, char const* range, + lapack_int const* n, + float* D, + float* E, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* m, + float* W, + lapack_complex_float* Z, lapack_int const* ldz, lapack_int const* nzc, lapack_int* ISUPPZ, lapack_logical* tryrac, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dstemr LAPACK_GLOBAL(dstemr,DSTEMR) +void LAPACK_dstemr( + char const* jobz, char const* range, + lapack_int const* n, + double* D, + double* E, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, lapack_int const* nzc, lapack_int* ISUPPZ, lapack_logical* tryrac, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sstemr LAPACK_GLOBAL(sstemr,SSTEMR) +void LAPACK_sstemr( + char const* jobz, char const* range, + lapack_int const* n, + float* D, + float* E, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, lapack_int const* nzc, lapack_int* 
ISUPPZ, lapack_logical* tryrac, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zstemr LAPACK_GLOBAL(zstemr,ZSTEMR) +void LAPACK_zstemr( + char const* jobz, char const* range, + lapack_int const* n, + double* D, + double* E, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* m, + double* W, + lapack_complex_double* Z, lapack_int const* ldz, lapack_int const* nzc, lapack_int* ISUPPZ, lapack_logical* tryrac, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_csteqr LAPACK_GLOBAL(csteqr,CSTEQR) +void LAPACK_csteqr( + char const* compz, + lapack_int const* n, + float* D, + float* E, + lapack_complex_float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr,DSTEQR) +void LAPACK_dsteqr( + char const* compz, + lapack_int const* n, + double* D, + double* E, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr,SSTEQR) +void LAPACK_ssteqr( + char const* compz, + lapack_int const* n, + float* D, + float* E, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr,ZSTEQR) +void LAPACK_zsteqr( + char const* compz, + lapack_int const* n, + double* D, + double* E, + lapack_complex_double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define LAPACK_dsterf LAPACK_GLOBAL(dsterf,DSTERF) +void LAPACK_dsterf( + lapack_int const* n, + double* D, + double* E, + lapack_int* info ); + +#define LAPACK_ssterf LAPACK_GLOBAL(ssterf,SSTERF) +void LAPACK_ssterf( + lapack_int const* n, + float* D, + float* E, + lapack_int* info ); + +#define LAPACK_dstev LAPACK_GLOBAL(dstev,DSTEV) +void LAPACK_dstev( + char const* jobz, + lapack_int const* n, + double* D, + double* E, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* info ); + +#define LAPACK_sstev LAPACK_GLOBAL(sstev,SSTEV) +void LAPACK_sstev( + char const* jobz, + lapack_int const* n, + float* D, + float* E, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* info ); + +#define LAPACK_dstevd LAPACK_GLOBAL(dstevd,DSTEVD) +void LAPACK_dstevd( + char const* jobz, + lapack_int const* n, + double* D, + double* E, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sstevd LAPACK_GLOBAL(sstevd,SSTEVD) +void LAPACK_sstevd( + char const* jobz, + lapack_int const* n, + float* D, + float* E, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dstevr LAPACK_GLOBAL(dstevr,DSTEVR) +void LAPACK_dstevr( + char const* jobz, char const* range, + lapack_int const* n, + double* D, + double* E, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sstevr LAPACK_GLOBAL(sstevr,SSTEVR) +void LAPACK_sstevr( + char const* jobz, char const* range, + lapack_int const* n, + float* D, + float* E, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* 
abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dstevx LAPACK_GLOBAL(dstevx,DSTEVX) +void LAPACK_dstevx( + char const* jobz, char const* range, + lapack_int const* n, + double* D, + double* E, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, + double* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_sstevx LAPACK_GLOBAL(sstevx,SSTEVX) +void LAPACK_sstevx( + char const* jobz, char const* range, + lapack_int const* n, + float* D, + float* E, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_csycon LAPACK_GLOBAL(csycon,CSYCON) +void LAPACK_csycon( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + float const* anorm, + float* rcond, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dsycon LAPACK_GLOBAL(dsycon,DSYCON) +void LAPACK_dsycon( + char const* uplo, + lapack_int const* n, + double const* A, lapack_int const* lda, lapack_int const* ipiv, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ssycon LAPACK_GLOBAL(ssycon,SSYCON) +void LAPACK_ssycon( + char const* uplo, + lapack_int const* n, + float const* A, lapack_int const* lda, lapack_int const* ipiv, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zsycon LAPACK_GLOBAL(zsycon,ZSYCON) +void LAPACK_zsycon( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + double const* anorm, + double* rcond, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_csycon_3 LAPACK_GLOBAL(csycon_3,CSYCON_3) +void LAPACK_csycon_3( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* E, lapack_int const* ipiv, + float const* anorm, + float* rcond, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dsycon_3 LAPACK_GLOBAL(dsycon_3,DSYCON_3) +void LAPACK_dsycon_3( + char const* uplo, + lapack_int const* n, + double const* A, lapack_int const* lda, + double const* E, lapack_int const* ipiv, + double const* anorm, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ssycon_3 LAPACK_GLOBAL(ssycon_3,SSYCON_3) +void LAPACK_ssycon_3( + char const* uplo, + lapack_int const* n, + float const* A, lapack_int const* lda, + float const* E, lapack_int const* ipiv, + float const* anorm, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zsycon_3 LAPACK_GLOBAL(zsycon_3,ZSYCON_3) +void LAPACK_zsycon_3( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* E, lapack_int const* ipiv, + double const* anorm, + double* rcond, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_csyconv LAPACK_GLOBAL(csyconv,CSYCONV) +void LAPACK_csyconv( + char const* uplo, char const* way, + 
lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* E, + lapack_int* info ); + +#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv,DSYCONV) +void LAPACK_dsyconv( + char const* uplo, char const* way, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int const* ipiv, + double* E, + lapack_int* info ); + +#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv,SSYCONV) +void LAPACK_ssyconv( + char const* uplo, char const* way, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int const* ipiv, + float* E, + lapack_int* info ); + +#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv,ZSYCONV) +void LAPACK_zsyconv( + char const* uplo, char const* way, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* E, + lapack_int* info ); + +#define LAPACK_csyequb LAPACK_GLOBAL(csyequb,CSYEQUB) +void LAPACK_csyequb( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* S, + float* scond, + float* amax, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb,DSYEQUB) +void LAPACK_dsyequb( + char const* uplo, + lapack_int const* n, + double const* A, lapack_int const* lda, + double* S, + double* scond, + double* amax, + double* work, + lapack_int* info ); + +#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb,SSYEQUB) +void LAPACK_ssyequb( + char const* uplo, + lapack_int const* n, + float const* A, lapack_int const* lda, + float* S, + float* scond, + float* amax, + float* work, + lapack_int* info ); + +#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb,ZSYEQUB) +void LAPACK_zsyequb( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* S, + double* scond, + double* amax, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_dsyev LAPACK_GLOBAL(dsyev,DSYEV) +void LAPACK_dsyev( + char const* jobz, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* W, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssyev LAPACK_GLOBAL(ssyev,SSYEV) +void LAPACK_ssyev( + char const* jobz, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* W, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsyev_2stage LAPACK_GLOBAL(dsyev_2stage,DSYEV_2STAGE) +void LAPACK_dsyev_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* W, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssyev_2stage LAPACK_GLOBAL(ssyev_2stage,SSYEV_2STAGE) +void LAPACK_ssyev_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* W, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd,DSYEVD) +void LAPACK_dsyevd( + char const* jobz, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* W, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd,SSYEVD) +void LAPACK_ssyevd( + char const* jobz, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* W, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define 
LAPACK_dsyevd_2stage LAPACK_GLOBAL(dsyevd_2stage,DSYEVD_2STAGE) +void LAPACK_dsyevd_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* W, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ssyevd_2stage LAPACK_GLOBAL(ssyevd_2stage,SSYEVD_2STAGE) +void LAPACK_ssyevd_2stage( + char const* jobz, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* W, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr,DSYEVR) +void LAPACK_dsyevr( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr,SSYEVR) +void LAPACK_ssyevr( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dsyevr_2stage LAPACK_GLOBAL(dsyevr_2stage,DSYEVR_2STAGE) +void LAPACK_dsyevr_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ssyevr_2stage LAPACK_GLOBAL(ssyevr_2stage,SSYEVR_2STAGE) +void LAPACK_ssyevr_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, lapack_int* ISUPPZ, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx,DSYEVX) +void LAPACK_dsyevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx,SSYEVX) +void LAPACK_ssyevx( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int* IFAIL, + 
lapack_int* info ); + +#define LAPACK_dsyevx_2stage LAPACK_GLOBAL(dsyevx_2stage,DSYEVX_2STAGE) +void LAPACK_dsyevx_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_ssyevx_2stage LAPACK_GLOBAL(ssyevx_2stage,SSYEVX_2STAGE) +void LAPACK_ssyevx_2stage( + char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_dsygst LAPACK_GLOBAL(dsygst,DSYGST) +void LAPACK_dsygst( + lapack_int const* itype, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ssygst LAPACK_GLOBAL(ssygst,SSYGST) +void LAPACK_ssygst( + lapack_int const* itype, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dsygv LAPACK_GLOBAL(dsygv,DSYGV) +void LAPACK_dsygv( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* W, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssygv LAPACK_GLOBAL(ssygv,SSYGV) +void LAPACK_ssygv( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* W, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsygv_2stage LAPACK_GLOBAL(dsygv_2stage,DSYGV_2STAGE) +void LAPACK_dsygv_2stage( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* W, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssygv_2stage LAPACK_GLOBAL(ssygv_2stage,SSYGV_2STAGE) +void LAPACK_ssygv_2stage( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* W, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd,DSYGVD) +void LAPACK_dsygvd( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* W, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd,SSYGVD) +void LAPACK_ssygvd( + lapack_int const* itype, char const* jobz, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* W, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx,DSYGVX) +void LAPACK_dsygvx( + lapack_int 
const* itype, char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double const* vl, + double const* vu, lapack_int const* il, lapack_int const* iu, + double const* abstol, lapack_int* m, + double* W, + double* Z, lapack_int const* ldz, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx,SSYGVX) +void LAPACK_ssygvx( + lapack_int const* itype, char const* jobz, char const* range, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float const* vl, + float const* vu, lapack_int const* il, lapack_int const* iu, + float const* abstol, lapack_int* m, + float* W, + float* Z, lapack_int const* ldz, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int* IFAIL, + lapack_int* info ); + +#define LAPACK_csyr LAPACK_GLOBAL(csyr,CSYR) +void LAPACK_csyr( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* alpha, + lapack_complex_float const* X, lapack_int const* incx, + lapack_complex_float* A, lapack_int const* lda ); + +#define LAPACK_zsyr LAPACK_GLOBAL(zsyr,ZSYR) +void LAPACK_zsyr( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* alpha, + lapack_complex_double const* X, lapack_int const* incx, + lapack_complex_double* A, lapack_int const* lda ); + +#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs,CSYRFS) +void LAPACK_csyrfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs,DSYRFS) +void LAPACK_dsyrfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs,SSYRFS) +void LAPACK_ssyrfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs,ZSYRFS) +void LAPACK_zsyrfs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx,CSYRFSX) +void LAPACK_csyrfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* AF, lapack_int 
const* ldaf, lapack_int const* ipiv, + float* S, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx,DSYRFSX) +void LAPACK_dsyrfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + double* S, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx,SSYRFSX) +void LAPACK_ssyrfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + float* S, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx,ZSYRFSX) +void LAPACK_zsyrfsx( + char const* uplo, char const* equed, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, + double* S, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_csysv LAPACK_GLOBAL(csysv,CSYSV) +void LAPACK_csysv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsysv LAPACK_GLOBAL(dsysv,DSYSV) +void LAPACK_dsysv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, lapack_int* ipiv, + double* B, lapack_int const* ldb, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssysv LAPACK_GLOBAL(ssysv,SSYSV) +void LAPACK_ssysv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, lapack_int* ipiv, + float* B, lapack_int const* ldb, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsysv LAPACK_GLOBAL(zsysv,ZSYSV) +void LAPACK_zsysv( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csysv_aa LAPACK_GLOBAL(csysv_aa,CSYSV_AA) +void LAPACK_csysv_aa( + char const* uplo, + lapack_int 
const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsysv_aa LAPACK_GLOBAL(dsysv_aa,DSYSV_AA) +void LAPACK_dsysv_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, lapack_int* ipiv, + double* B, lapack_int const* ldb, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssysv_aa LAPACK_GLOBAL(ssysv_aa,SSYSV_AA) +void LAPACK_ssysv_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, lapack_int* ipiv, + float* B, lapack_int const* ldb, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsysv_aa LAPACK_GLOBAL(zsysv_aa,ZSYSV_AA) +void LAPACK_zsysv_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csysv_aa_2stage LAPACK_GLOBAL(csysv_aa_2stage,CSYSV_AA_2STAGE) +void LAPACK_csysv_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsysv_aa_2stage LAPACK_GLOBAL(dsysv_aa_2stage,DSYSV_AA_2STAGE) +void LAPACK_dsysv_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + double* B, lapack_int const* ldb, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssysv_aa_2stage LAPACK_GLOBAL(ssysv_aa_2stage,SSYSV_AA_2STAGE) +void LAPACK_ssysv_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + float* B, lapack_int const* ldb, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsysv_aa_2stage LAPACK_GLOBAL(zsysv_aa_2stage,ZSYSV_AA_2STAGE) +void LAPACK_zsysv_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csysv_rk LAPACK_GLOBAL(csysv_rk,CSYSV_RK) +void LAPACK_csysv_rk( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* E, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsysv_rk LAPACK_GLOBAL(dsysv_rk,DSYSV_RK) +void LAPACK_dsysv_rk( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* E, lapack_int* ipiv, + double* B, lapack_int const* ldb, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssysv_rk 
LAPACK_GLOBAL(ssysv_rk,SSYSV_RK) +void LAPACK_ssysv_rk( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* E, lapack_int* ipiv, + float* B, lapack_int const* ldb, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsysv_rk LAPACK_GLOBAL(zsysv_rk,ZSYSV_RK) +void LAPACK_zsysv_rk( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* E, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csysv_rook LAPACK_GLOBAL(csysv_rook,CSYSV_ROOK) +void LAPACK_csysv_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsysv_rook LAPACK_GLOBAL(dsysv_rook,DSYSV_ROOK) +void LAPACK_dsysv_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, lapack_int* ipiv, + double* B, lapack_int const* ldb, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssysv_rook LAPACK_GLOBAL(ssysv_rook,SSYSV_ROOK) +void LAPACK_ssysv_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, lapack_int* ipiv, + float* B, lapack_int const* ldb, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsysv_rook LAPACK_GLOBAL(zsysv_rook,ZSYSV_ROOK) +void LAPACK_zsysv_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csysvx LAPACK_GLOBAL(csysvx,CSYSVX) +void LAPACK_csysvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx,DSYSVX) +void LAPACK_dsysvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double* AF, lapack_int const* ldaf, lapack_int* ipiv, + double const* B, lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx,SSYSVX) +void LAPACK_ssysvx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float* AF, lapack_int const* ldaf, lapack_int* ipiv, + float const* B, lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* ferr, + float* berr, + float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx,ZSYSVX) +void LAPACK_zsysvx( + char const* fact, char 
const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* ferr, + double* berr, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx,CSYSVXX) +void LAPACK_csysvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + float* S, + lapack_complex_float* B, + lapack_int const* ldb, + lapack_complex_float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx,DSYSVXX) +void LAPACK_dsysvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, + double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + double* S, + double* B, + lapack_int const* ldb, + double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx,SSYSVXX) +void LAPACK_ssysvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, + float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + float* S, + float* B, + lapack_int const* ldb, + float* X, lapack_int const* ldx, + float* rcond, + float* rpvgrw, + float* berr, lapack_int const* n_err_bnds, + float* err_bnds_norm, + float* err_bnds_comp, lapack_int const* nparams, + float* params, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx,ZSYSVXX) +void LAPACK_zsysvxx( + char const* fact, char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed, + double* S, + lapack_complex_double* B, + lapack_int const* ldb, + lapack_complex_double* X, lapack_int const* ldx, + double* rcond, + double* rpvgrw, + double* berr, lapack_int const* n_err_bnds, + double* err_bnds_norm, + double* err_bnds_comp, lapack_int const* nparams, + double* params, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr,CSYSWAPR) +void LAPACK_csyswapr( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 ); + +#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr,DSYSWAPR) +void LAPACK_dsyswapr( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 ); + +#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr,SSYSWAPR) +void LAPACK_ssyswapr( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 
); + +#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr,ZSYSWAPR) +void LAPACK_zsyswapr( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 ); + +#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd,DSYTRD) +void LAPACK_dsytrd( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* D, + double* E, + double* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd,SSYTRD) +void LAPACK_ssytrd( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* D, + float* E, + float* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytrd_2stage LAPACK_GLOBAL(dsytrd_2stage,DSYTRD_2STAGE) +void LAPACK_dsytrd_2stage( + char const* vect, char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* D, + double* E, + double* tau, + double* HOUS2, lapack_int const* lhous2, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytrd_2stage LAPACK_GLOBAL(ssytrd_2stage,SSYTRD_2STAGE) +void LAPACK_ssytrd_2stage( + char const* vect, char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* D, + float* E, + float* tau, + float* HOUS2, lapack_int const* lhous2, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytrf LAPACK_GLOBAL(csytrf,CSYTRF) +void LAPACK_csytrf( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf,DSYTRF) +void LAPACK_dsytrf( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* ipiv, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf,SSYTRF) +void LAPACK_ssytrf( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* ipiv, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf,ZSYTRF) +void LAPACK_zsytrf( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytrf_aa LAPACK_GLOBAL(csytrf_aa,CSYTRF_AA) +void LAPACK_csytrf_aa( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytrf_aa LAPACK_GLOBAL(dsytrf_aa,DSYTRF_AA) +void LAPACK_dsytrf_aa( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* ipiv, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytrf_aa LAPACK_GLOBAL(ssytrf_aa,SSYTRF_AA) +void LAPACK_ssytrf_aa( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* ipiv, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsytrf_aa LAPACK_GLOBAL(zsytrf_aa,ZSYTRF_AA) +void LAPACK_zsytrf_aa( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytrf_aa_2stage LAPACK_GLOBAL(csytrf_aa_2stage,CSYTRF_AA_2STAGE) +void 
LAPACK_csytrf_aa_2stage( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytrf_aa_2stage LAPACK_GLOBAL(dsytrf_aa_2stage,DSYTRF_AA_2STAGE) +void LAPACK_dsytrf_aa_2stage( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytrf_aa_2stage LAPACK_GLOBAL(ssytrf_aa_2stage,SSYTRF_AA_2STAGE) +void LAPACK_ssytrf_aa_2stage( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsytrf_aa_2stage LAPACK_GLOBAL(zsytrf_aa_2stage,ZSYTRF_AA_2STAGE) +void LAPACK_zsytrf_aa_2stage( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytrf_rk LAPACK_GLOBAL(csytrf_rk,CSYTRF_RK) +void LAPACK_csytrf_rk( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* E, lapack_int* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytrf_rk LAPACK_GLOBAL(dsytrf_rk,DSYTRF_RK) +void LAPACK_dsytrf_rk( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double* E, lapack_int* ipiv, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytrf_rk LAPACK_GLOBAL(ssytrf_rk,SSYTRF_RK) +void LAPACK_ssytrf_rk( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float* E, lapack_int* ipiv, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsytrf_rk LAPACK_GLOBAL(zsytrf_rk,ZSYTRF_RK) +void LAPACK_zsytrf_rk( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* E, lapack_int* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytrf_rook LAPACK_GLOBAL(csytrf_rook,CSYTRF_ROOK) +void LAPACK_csytrf_rook( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytrf_rook LAPACK_GLOBAL(dsytrf_rook,DSYTRF_ROOK) +void LAPACK_dsytrf_rook( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int* ipiv, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytrf_rook LAPACK_GLOBAL(ssytrf_rook,SSYTRF_ROOK) +void LAPACK_ssytrf_rook( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int* ipiv, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsytrf_rook LAPACK_GLOBAL(zsytrf_rook,ZSYTRF_ROOK) +void LAPACK_zsytrf_rook( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytri 
LAPACK_GLOBAL(csytri,CSYTRI) +void LAPACK_csytri( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dsytri LAPACK_GLOBAL(dsytri,DSYTRI) +void LAPACK_dsytri( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int const* ipiv, + double* work, + lapack_int* info ); + +#define LAPACK_ssytri LAPACK_GLOBAL(ssytri,SSYTRI) +void LAPACK_ssytri( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int const* ipiv, + float* work, + lapack_int* info ); + +#define LAPACK_zsytri LAPACK_GLOBAL(zsytri,ZSYTRI) +void LAPACK_zsytri( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2,CSYTRI2) +void LAPACK_csytri2( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2,DSYTRI2) +void LAPACK_dsytri2( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int const* ipiv, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2,SSYTRI2) +void LAPACK_ssytri2( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int const* ipiv, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2,ZSYTRI2) +void LAPACK_zsytri2( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x,CSYTRI2X) +void LAPACK_csytri2x( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* work, lapack_int const* nb, + lapack_int* info ); + +#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x,DSYTRI2X) +void LAPACK_dsytri2x( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, lapack_int const* ipiv, + double* work, lapack_int const* nb, + lapack_int* info ); + +#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x,SSYTRI2X) +void LAPACK_ssytri2x( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, lapack_int const* ipiv, + float* work, lapack_int const* nb, + lapack_int* info ); + +#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x,ZSYTRI2X) +void LAPACK_zsytri2x( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* work, lapack_int const* nb, + lapack_int* info ); + +#define LAPACK_csytri_3 LAPACK_GLOBAL(csytri_3,CSYTRI_3) +void LAPACK_csytri_3( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* E, lapack_int const* ipiv, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytri_3 LAPACK_GLOBAL(dsytri_3,DSYTRI_3) +void LAPACK_dsytri_3( + char const* uplo, + lapack_int const* n, + double* A, lapack_int const* lda, + double const* E, lapack_int const* ipiv, + double* work, lapack_int const* lwork, + lapack_int* info ); + 
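+/* Illustrative usage sketch (not an upstream declaration): every prototype in
+ * this header follows the Fortran calling convention, so all scalar arguments
+ * -- dimensions, leading dimensions, option characters -- are passed by
+ * pointer, matrices are column-major, and `info` is an output argument.
+ * A minimal sketch calling the LAPACK_dsysv prototype declared earlier in
+ * this header; the helper name example_dsysv, the fixed lwork, and the
+ * matrix data are assumptions for illustration only.
+ */
+#if 0 /* sketch only; kept out of compilation */
+#include <stdio.h>
+static void example_dsysv(void)
+{
+    char uplo = 'U';                 /* reference the upper triangle of A */
+    lapack_int n = 3, nrhs = 1, lda = 3, ldb = 3, lwork = 64, info = 0;
+    double A[9] = { 4, 1, 2,         /* symmetric 3x3, stored column-major */
+                    1, 3, 0,
+                    2, 0, 5 };
+    double B[3] = { 1, 2, 3 };       /* right-hand side; overwritten with x */
+    lapack_int ipiv[3];              /* pivot indices from the factorization */
+    double work[64];
+
+    /* Solve A*x = B via the Bunch-Kaufman factorization of A. */
+    LAPACK_dsysv(&uplo, &n, &nrhs, A, &lda, ipiv, B, &ldb, work, &lwork, &info);
+    if (info == 0)
+        printf("x = [%g %g %g]\n", B[0], B[1], B[2]);
+}
+#endif
+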
+#define LAPACK_ssytri_3 LAPACK_GLOBAL(ssytri_3,SSYTRI_3) +void LAPACK_ssytri_3( + char const* uplo, + lapack_int const* n, + float* A, lapack_int const* lda, + float const* E, lapack_int const* ipiv, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsytri_3 LAPACK_GLOBAL(zsytri_3,ZSYTRI_3) +void LAPACK_zsytri_3( + char const* uplo, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* E, lapack_int const* ipiv, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytrs LAPACK_GLOBAL(csytrs,CSYTRS) +void LAPACK_csytrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs,DSYTRS) +void LAPACK_dsytrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs,SSYTRS) +void LAPACK_ssytrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs,ZSYTRS) +void LAPACK_zsytrs( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2,CSYTRS2) +void LAPACK_csytrs2( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2,DSYTRS2) +void LAPACK_dsytrs2( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double* A, lapack_int const* lda, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + double* work, + lapack_int* info ); + +#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2,SSYTRS2) +void LAPACK_ssytrs2( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float* A, lapack_int const* lda, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + float* work, + lapack_int* info ); + +#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2,ZSYTRS2) +void LAPACK_zsytrs2( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_csytrs_3 LAPACK_GLOBAL(csytrs_3,CSYTRS_3) +void LAPACK_csytrs_3( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* E, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dsytrs_3 LAPACK_GLOBAL(dsytrs_3,DSYTRS_3) +void LAPACK_dsytrs_3( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double const* E, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ssytrs_3 
LAPACK_GLOBAL(ssytrs_3,SSYTRS_3) +void LAPACK_ssytrs_3( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float const* E, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zsytrs_3 LAPACK_GLOBAL(zsytrs_3,ZSYTRS_3) +void LAPACK_zsytrs_3( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* E, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_csytrs_aa LAPACK_GLOBAL(csytrs_aa,CSYTRS_AA) +void LAPACK_csytrs_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dsytrs_aa LAPACK_GLOBAL(dsytrs_aa,DSYTRS_AA) +void LAPACK_dsytrs_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ssytrs_aa LAPACK_GLOBAL(ssytrs_aa,SSYTRS_AA) +void LAPACK_ssytrs_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zsytrs_aa LAPACK_GLOBAL(zsytrs_aa,ZSYTRS_AA) +void LAPACK_zsytrs_aa( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_csytrs_aa_2stage LAPACK_GLOBAL(csytrs_aa_2stage,CSYTRS_AA_2STAGE) +void LAPACK_csytrs_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dsytrs_aa_2stage LAPACK_GLOBAL(dsytrs_aa_2stage,DSYTRS_AA_2STAGE) +void LAPACK_dsytrs_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ssytrs_aa_2stage LAPACK_GLOBAL(ssytrs_aa_2stage,SSYTRS_AA_2STAGE) +void LAPACK_ssytrs_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zsytrs_aa_2stage LAPACK_GLOBAL(zsytrs_aa_2stage,ZSYTRS_AA_2STAGE) +void LAPACK_zsytrs_aa_2stage( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_csytrs_rook 
LAPACK_GLOBAL(csytrs_rook,CSYTRS_ROOK) +void LAPACK_csytrs_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dsytrs_rook LAPACK_GLOBAL(dsytrs_rook,DSYTRS_ROOK) +void LAPACK_dsytrs_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, lapack_int const* ipiv, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ssytrs_rook LAPACK_GLOBAL(ssytrs_rook,SSYTRS_ROOK) +void LAPACK_ssytrs_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, lapack_int const* ipiv, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_zsytrs_rook LAPACK_GLOBAL(zsytrs_rook,ZSYTRS_ROOK) +void LAPACK_zsytrs_rook( + char const* uplo, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon,CTBCON) +void LAPACK_ctbcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, lapack_int const* kd, + lapack_complex_float const* AB, lapack_int const* ldab, + float* rcond, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon,DTBCON) +void LAPACK_dtbcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, lapack_int const* kd, + double const* AB, lapack_int const* ldab, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_stbcon LAPACK_GLOBAL(stbcon,STBCON) +void LAPACK_stbcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, lapack_int const* kd, + float const* AB, lapack_int const* ldab, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon,ZTBCON) +void LAPACK_ztbcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, lapack_int const* kd, + lapack_complex_double const* AB, lapack_int const* ldab, + double* rcond, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs,CTBRFS) +void LAPACK_ctbrfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_float const* AB, lapack_int const* ldab, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float const* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs,DTBRFS) +void LAPACK_dtbrfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + double const* AB, lapack_int const* ldab, + double const* B, lapack_int const* ldb, + double const* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs,STBRFS) +void LAPACK_stbrfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + float const* AB, lapack_int const* ldab, + float const* B, 
lapack_int const* ldb, + float const* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs,ZTBRFS) +void LAPACK_ztbrfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_double const* AB, lapack_int const* ldab, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double const* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs,CTBTRS) +void LAPACK_ctbtrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_float const* AB, lapack_int const* ldab, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs,DTBTRS) +void LAPACK_dtbtrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + double const* AB, lapack_int const* ldab, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs,STBTRS) +void LAPACK_stbtrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + float const* AB, lapack_int const* ldab, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs,ZTBTRS) +void LAPACK_ztbtrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs, + lapack_complex_double const* AB, lapack_int const* ldab, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm,CTFSM) +void LAPACK_ctfsm( + char const* transr, char const* side, char const* uplo, char const* trans, char const* diag, + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* alpha, + lapack_complex_float const* A, + lapack_complex_float* B, lapack_int const* ldb ); + +#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm,DTFSM) +void LAPACK_dtfsm( + char const* transr, char const* side, char const* uplo, char const* trans, char const* diag, + lapack_int const* m, lapack_int const* n, + double const* alpha, + double const* A, + double* B, lapack_int const* ldb ); + +#define LAPACK_stfsm LAPACK_GLOBAL(stfsm,STFSM) +void LAPACK_stfsm( + char const* transr, char const* side, char const* uplo, char const* trans, char const* diag, + lapack_int const* m, lapack_int const* n, + float const* alpha, + float const* A, + float* B, lapack_int const* ldb ); + +#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm,ZTFSM) +void LAPACK_ztfsm( + char const* transr, char const* side, char const* uplo, char const* trans, char const* diag, + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* alpha, + lapack_complex_double const* A, + lapack_complex_double* B, lapack_int const* ldb ); + +#define LAPACK_ctftri LAPACK_GLOBAL(ctftri,CTFTRI) +void LAPACK_ctftri( + char const* transr, char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_float* A, + lapack_int* info ); + +#define LAPACK_dtftri LAPACK_GLOBAL(dtftri,DTFTRI) +void LAPACK_dtftri( + char const* transr, char const* uplo, char const* diag, + lapack_int const* n, + double* A, + lapack_int* info ); + +#define 
LAPACK_stftri LAPACK_GLOBAL(stftri,STFTRI) +void LAPACK_stftri( + char const* transr, char const* uplo, char const* diag, + lapack_int const* n, + float* A, + lapack_int* info ); + +#define LAPACK_ztftri LAPACK_GLOBAL(ztftri,ZTFTRI) +void LAPACK_ztftri( + char const* transr, char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_double* A, + lapack_int* info ); + +#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp,CTFTTP) +void LAPACK_ctfttp( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_float const* ARF, + lapack_complex_float* AP, + lapack_int* info ); + +#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp,DTFTTP) +void LAPACK_dtfttp( + char const* transr, char const* uplo, + lapack_int const* n, + double const* ARF, + double* AP, + lapack_int* info ); + +#define LAPACK_stfttp LAPACK_GLOBAL(stfttp,STFTTP) +void LAPACK_stfttp( + char const* transr, char const* uplo, + lapack_int const* n, + float const* ARF, + float* AP, + lapack_int* info ); + +#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp,ZTFTTP) +void LAPACK_ztfttp( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_double const* ARF, + lapack_complex_double* AP, + lapack_int* info ); + +#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr,CTFTTR) +void LAPACK_ctfttr( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_float const* ARF, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr,DTFTTR) +void LAPACK_dtfttr( + char const* transr, char const* uplo, + lapack_int const* n, + double const* ARF, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_stfttr LAPACK_GLOBAL(stfttr,STFTTR) +void LAPACK_stfttr( + char const* transr, char const* uplo, + lapack_int const* n, + float const* ARF, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr,ZTFTTR) +void LAPACK_ztfttr( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_double const* ARF, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc,CTGEVC) +void LAPACK_ctgevc( + char const* side, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_float const* S, lapack_int const* lds, + lapack_complex_float const* P, lapack_int const* ldp, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc,DTGEVC) +void LAPACK_dtgevc( + char const* side, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + double const* S, lapack_int const* lds, + double const* P, lapack_int const* ldp, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + double* work, + lapack_int* info ); + +#define LAPACK_stgevc LAPACK_GLOBAL(stgevc,STGEVC) +void LAPACK_stgevc( + char const* side, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + float const* S, lapack_int const* lds, + float const* P, lapack_int const* ldp, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + float* work, + lapack_int* info ); + +#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc,ZTGEVC) +void LAPACK_ztgevc( + char const* side, char const* 
howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_double const* S, lapack_int const* lds, + lapack_complex_double const* P, lapack_int const* ldp, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc,CTGEXC) +void LAPACK_ctgexc( + lapack_logical const* wantq, lapack_logical const* wantz, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* Z, lapack_int const* ldz, lapack_int const* ifst, lapack_int* ilst, + lapack_int* info ); + +#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc,DTGEXC) +void LAPACK_dtgexc( + lapack_logical const* wantq, lapack_logical const* wantz, lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* Q, lapack_int const* ldq, + double* Z, lapack_int const* ldz, lapack_int* ifst, lapack_int* ilst, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_stgexc LAPACK_GLOBAL(stgexc,STGEXC) +void LAPACK_stgexc( + lapack_logical const* wantq, lapack_logical const* wantz, lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* Q, lapack_int const* ldq, + float* Z, lapack_int const* ldz, lapack_int* ifst, lapack_int* ilst, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc,ZTGEXC) +void LAPACK_ztgexc( + lapack_logical const* wantq, lapack_logical const* wantz, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* Z, lapack_int const* ldz, lapack_int const* ifst, lapack_int* ilst, + lapack_int* info ); + +#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen,CTGSEN) +void LAPACK_ctgsen( + lapack_int const* ijob, lapack_logical const* wantq, lapack_logical const* wantz, lapack_logical const* select, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* alpha, + lapack_complex_float* beta, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* Z, lapack_int const* ldz, lapack_int* m, + float* pl, + float* pr, + float* DIF, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen,DTGSEN) +void LAPACK_dtgsen( + lapack_int const* ijob, lapack_logical const* wantq, lapack_logical const* wantz, lapack_logical const* select, lapack_int const* n, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* alphar, + double* alphai, + double* beta, + double* Q, lapack_int const* ldq, + double* Z, lapack_int const* ldz, lapack_int* m, + double* pl, + double* pr, + double* DIF, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_stgsen LAPACK_GLOBAL(stgsen,STGSEN) +void LAPACK_stgsen( + lapack_int const* ijob, lapack_logical const* wantq, lapack_logical const* wantz, lapack_logical const* select, lapack_int const* n, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* alphar, + 
float* alphai, + float* beta, + float* Q, lapack_int const* ldq, + float* Z, lapack_int const* ldz, lapack_int* m, + float* pl, + float* pr, + float* DIF, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen,ZTGSEN) +void LAPACK_ztgsen( + lapack_int const* ijob, lapack_logical const* wantq, lapack_logical const* wantz, lapack_logical const* select, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* alpha, + lapack_complex_double* beta, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* Z, lapack_int const* ldz, lapack_int* m, + double* pl, + double* pr, + double* DIF, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja,CTGSJA) +void LAPACK_ctgsja( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_int const* k, lapack_int const* l, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + float const* tola, + float const* tolb, + float* alpha, + float* beta, + lapack_complex_float* U, lapack_int const* ldu, + lapack_complex_float* V, lapack_int const* ldv, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* work, lapack_int* ncycle, + lapack_int* info ); + +#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja,DTGSJA) +void LAPACK_dtgsja( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_int const* k, lapack_int const* l, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double const* tola, + double const* tolb, + double* alpha, + double* beta, + double* U, lapack_int const* ldu, + double* V, lapack_int const* ldv, + double* Q, lapack_int const* ldq, + double* work, lapack_int* ncycle, + lapack_int* info ); + +#define LAPACK_stgsja LAPACK_GLOBAL(stgsja,STGSJA) +void LAPACK_stgsja( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_int const* k, lapack_int const* l, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float const* tola, + float const* tolb, + float* alpha, + float* beta, + float* U, lapack_int const* ldu, + float* V, lapack_int const* ldv, + float* Q, lapack_int const* ldq, + float* work, lapack_int* ncycle, + lapack_int* info ); + +#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja,ZTGSJA) +void LAPACK_ztgsja( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_int const* k, lapack_int const* l, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + double const* tola, + double const* tolb, + double* alpha, + double* beta, + lapack_complex_double* U, lapack_int const* ldu, + lapack_complex_double* V, lapack_int const* ldv, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* work, lapack_int* ncycle, + lapack_int* info ); + +#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna,CTGSNA) +void LAPACK_ctgsna( + char const* job, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, 
lapack_int const* ldb, + lapack_complex_float const* VL, lapack_int const* ldvl, + lapack_complex_float const* VR, lapack_int const* ldvr, + float* S, + float* DIF, lapack_int const* mm, lapack_int* m, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna,DTGSNA) +void LAPACK_dtgsna( + char const* job, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double const* VL, lapack_int const* ldvl, + double const* VR, lapack_int const* ldvr, + double* S, + double* DIF, lapack_int const* mm, lapack_int* m, + double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_stgsna LAPACK_GLOBAL(stgsna,STGSNA) +void LAPACK_stgsna( + char const* job, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float const* VL, lapack_int const* ldvl, + float const* VR, lapack_int const* ldvr, + float* S, + float* DIF, lapack_int const* mm, lapack_int* m, + float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna,ZTGSNA) +void LAPACK_ztgsna( + char const* job, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double const* VL, lapack_int const* ldvl, + lapack_complex_double const* VR, lapack_int const* ldvr, + double* S, + double* DIF, lapack_int const* mm, lapack_int* m, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl,CTGSYL) +void LAPACK_ctgsyl( + char const* trans, + lapack_int const* ijob, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float const* D, lapack_int const* ldd, + lapack_complex_float const* E, lapack_int const* lde, + lapack_complex_float* F, lapack_int const* ldf, + float* dif, + float* scale, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl,DTGSYL) +void LAPACK_dtgsyl( + char const* trans, + lapack_int const* ijob, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, + double const* D, lapack_int const* ldd, + double const* E, lapack_int const* lde, + double* F, lapack_int const* ldf, + double* dif, + double* scale, + double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl,STGSYL) +void LAPACK_stgsyl( + char const* trans, + lapack_int const* ijob, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, + float const* D, lapack_int const* ldd, + float const* E, lapack_int const* lde, + float* F, lapack_int const* ldf, + float* dif, + float* scale, + float* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl,ZTGSYL) +void 
LAPACK_ztgsyl( + char const* trans, + lapack_int const* ijob, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double const* D, lapack_int const* ldd, + lapack_complex_double const* E, lapack_int const* lde, + lapack_complex_double* F, lapack_int const* ldf, + double* dif, + double* scale, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon,CTPCON) +void LAPACK_ctpcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_float const* AP, + float* rcond, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon,DTPCON) +void LAPACK_dtpcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + double const* AP, + double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_stpcon LAPACK_GLOBAL(stpcon,STPCON) +void LAPACK_stpcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + float const* AP, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon,ZTPCON) +void LAPACK_ztpcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_double const* AP, + double* rcond, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctplqt LAPACK_GLOBAL(ctplqt,CTPLQT) +void LAPACK_ctplqt( + lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* mb, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dtplqt LAPACK_GLOBAL(dtplqt,DTPLQT) +void LAPACK_dtplqt( + lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* mb, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* T, lapack_int const* ldt, + double* work, + lapack_int* info ); + +#define LAPACK_stplqt LAPACK_GLOBAL(stplqt,STPLQT) +void LAPACK_stplqt( + lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* mb, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* T, lapack_int const* ldt, + float* work, + lapack_int* info ); + +#define LAPACK_ztplqt LAPACK_GLOBAL(ztplqt,ZTPLQT) +void LAPACK_ztplqt( + lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* mb, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_ctplqt2 LAPACK_GLOBAL(ctplqt2,CTPLQT2) +void LAPACK_ctplqt2( + lapack_int const* m, lapack_int const* n, lapack_int const* l, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_dtplqt2 LAPACK_GLOBAL(dtplqt2,DTPLQT2) +void LAPACK_dtplqt2( + lapack_int const* m, lapack_int const* n, lapack_int const* l, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* T, lapack_int const* ldt, + lapack_int* info ); + +#define 
LAPACK_stplqt2 LAPACK_GLOBAL(stplqt2,STPLQT2) +void LAPACK_stplqt2( + lapack_int const* m, lapack_int const* n, lapack_int const* l, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_ztplqt2 LAPACK_GLOBAL(ztplqt2,ZTPLQT2) +void LAPACK_ztplqt2( + lapack_int const* m, lapack_int const* n, lapack_int const* l, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_ctpmlqt LAPACK_GLOBAL(ctpmlqt,CTPMLQT) +void LAPACK_ctpmlqt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* mb, + lapack_complex_float const* V, lapack_int const* ldv, + lapack_complex_float const* T, lapack_int const* ldt, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dtpmlqt LAPACK_GLOBAL(dtpmlqt,DTPMLQT) +void LAPACK_dtpmlqt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* mb, + double const* V, lapack_int const* ldv, + double const* T, lapack_int const* ldt, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* work, + lapack_int* info ); + +#define LAPACK_stpmlqt LAPACK_GLOBAL(stpmlqt,STPMLQT) +void LAPACK_stpmlqt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* mb, + float const* V, lapack_int const* ldv, + float const* T, lapack_int const* ldt, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* work, + lapack_int* info ); + +#define LAPACK_ztpmlqt LAPACK_GLOBAL(ztpmlqt,ZTPMLQT) +void LAPACK_ztpmlqt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* mb, + lapack_complex_double const* V, lapack_int const* ldv, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt,CTPMQRT) +void LAPACK_ctpmqrt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* nb, + lapack_complex_float const* V, lapack_int const* ldv, + lapack_complex_float const* T, lapack_int const* ldt, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt,DTPMQRT) +void LAPACK_dtpmqrt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* nb, + double const* V, lapack_int const* ldv, + double const* T, lapack_int const* ldt, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* work, + lapack_int* info ); + +#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt,STPMQRT) +void LAPACK_stpmqrt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* nb, + float const* V, 
lapack_int const* ldv, + float const* T, lapack_int const* ldt, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* work, + lapack_int* info ); + +#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt,ZTPMQRT) +void LAPACK_ztpmqrt( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* nb, + lapack_complex_double const* V, lapack_int const* ldv, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt,CTPQRT) +void LAPACK_ctpqrt( + lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* nb, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt,DTPQRT) +void LAPACK_dtpqrt( + lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* nb, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* T, lapack_int const* ldt, + double* work, + lapack_int* info ); + +#define LAPACK_stpqrt LAPACK_GLOBAL(stpqrt,STPQRT) +void LAPACK_stpqrt( + lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* nb, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* T, lapack_int const* ldt, + float* work, + lapack_int* info ); + +#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt,ZTPQRT) +void LAPACK_ztpqrt( + lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* nb, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* work, + lapack_int* info ); + +#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2,CTPQRT2) +void LAPACK_ctpqrt2( + lapack_int const* m, lapack_int const* n, lapack_int const* l, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2,DTPQRT2) +void LAPACK_dtpqrt2( + lapack_int const* m, lapack_int const* n, lapack_int const* l, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2,STPQRT2) +void LAPACK_stpqrt2( + lapack_int const* m, lapack_int const* n, lapack_int const* l, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2,ZTPQRT2) +void LAPACK_ztpqrt2( + lapack_int const* m, lapack_int const* n, lapack_int const* l, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* T, lapack_int const* ldt, + lapack_int* info ); + +#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb,CTPRFB) +void LAPACK_ctprfb( + char const* side, char const* trans, char const* direct, char const* storev, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, + lapack_complex_float const* V, lapack_int const* ldv, + lapack_complex_float const* T, lapack_int const* ldt, + 
lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_complex_float* work, lapack_int const* ldwork ); + +#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb,DTPRFB) +void LAPACK_dtprfb( + char const* side, char const* trans, char const* direct, char const* storev, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, + double const* V, lapack_int const* ldv, + double const* T, lapack_int const* ldt, + double* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + double* work, lapack_int const* ldwork ); + +#define LAPACK_stprfb LAPACK_GLOBAL(stprfb,STPRFB) +void LAPACK_stprfb( + char const* side, char const* trans, char const* direct, char const* storev, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, + float const* V, lapack_int const* ldv, + float const* T, lapack_int const* ldt, + float* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + float* work, lapack_int const* ldwork ); + +#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb,ZTPRFB) +void LAPACK_ztprfb( + char const* side, char const* trans, char const* direct, char const* storev, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, + lapack_complex_double const* V, lapack_int const* ldv, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_complex_double* work, lapack_int const* ldwork ); + +#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs,CTPRFS) +void LAPACK_ctprfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float const* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs,DTPRFS) +void LAPACK_dtprfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + double const* AP, + double const* B, lapack_int const* ldb, + double const* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_stprfs LAPACK_GLOBAL(stprfs,STPRFS) +void LAPACK_stprfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + float const* AP, + float const* B, lapack_int const* ldb, + float const* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs,ZTPRFS) +void LAPACK_ztprfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double const* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctptri LAPACK_GLOBAL(ctptri,CTPTRI) +void LAPACK_ctptri( + char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_float* AP, + lapack_int* info ); + +#define LAPACK_dtptri LAPACK_GLOBAL(dtptri,DTPTRI) +void LAPACK_dtptri( + char const* uplo, char const* diag, + lapack_int const* n, + double* AP, + lapack_int* info ); + +#define LAPACK_stptri LAPACK_GLOBAL(stptri,STPTRI) 
+void LAPACK_stptri( + char const* uplo, char const* diag, + lapack_int const* n, + float* AP, + lapack_int* info ); + +#define LAPACK_ztptri LAPACK_GLOBAL(ztptri,ZTPTRI) +void LAPACK_ztptri( + char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_double* AP, + lapack_int* info ); + +#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs,CTPTRS) +void LAPACK_ctptrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* AP, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs,DTPTRS) +void LAPACK_dtptrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + double const* AP, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_stptrs LAPACK_GLOBAL(stptrs,STPTRS) +void LAPACK_stptrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + float const* AP, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs,ZTPTRS) +void LAPACK_ztptrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* AP, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf,CTPTTF) +void LAPACK_ctpttf( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, + lapack_complex_float* ARF, + lapack_int* info ); + +#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf,DTPTTF) +void LAPACK_dtpttf( + char const* transr, char const* uplo, + lapack_int const* n, + double const* AP, + double* ARF, + lapack_int* info ); + +#define LAPACK_stpttf LAPACK_GLOBAL(stpttf,STPTTF) +void LAPACK_stpttf( + char const* transr, char const* uplo, + lapack_int const* n, + float const* AP, + float* ARF, + lapack_int* info ); + +#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf,ZTPTTF) +void LAPACK_ztpttf( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_double const* AP, + lapack_complex_double* ARF, + lapack_int* info ); + +#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr,CTPTTR) +void LAPACK_ctpttr( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr,DTPTTR) +void LAPACK_dtpttr( + char const* uplo, + lapack_int const* n, + double const* AP, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_stpttr LAPACK_GLOBAL(stpttr,STPTTR) +void LAPACK_stpttr( + char const* uplo, + lapack_int const* n, + float const* AP, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr,ZTPTTR) +void LAPACK_ztpttr( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* AP, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon,CTRCON) +void LAPACK_ctrcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + float* rcond, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon,DTRCON) +void LAPACK_dtrcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + double const* A, lapack_int const* lda, 
+ double* rcond, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_strcon LAPACK_GLOBAL(strcon,STRCON) +void LAPACK_strcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + float const* A, lapack_int const* lda, + float* rcond, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon,ZTRCON) +void LAPACK_ztrcon( + char const* norm, char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + double* rcond, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc,CTREVC) +void LAPACK_ctrevc( + char const* side, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc,DTREVC) +void LAPACK_dtrevc( + char const* side, char const* howmny, + lapack_logical* select, + lapack_int const* n, + double const* T, lapack_int const* ldt, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + double* work, + lapack_int* info ); + +#define LAPACK_strevc LAPACK_GLOBAL(strevc,STREVC) +void LAPACK_strevc( + char const* side, char const* howmny, + lapack_logical* select, + lapack_int const* n, + float const* T, lapack_int const* ldt, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + float* work, + lapack_int* info ); + +#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc,ZTREVC) +void LAPACK_ztrevc( + char const* side, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctrevc3 LAPACK_GLOBAL(ctrevc3,CTREVC3) +void LAPACK_ctrevc3( + char const* side, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* VL, lapack_int const* ldvl, + lapack_complex_float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* info ); + +#define LAPACK_dtrevc3 LAPACK_GLOBAL(dtrevc3,DTREVC3) +void LAPACK_dtrevc3( + char const* side, char const* howmny, + lapack_logical* select, + lapack_int const* n, + double const* T, lapack_int const* ldt, + double* VL, lapack_int const* ldvl, + double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_strevc3 LAPACK_GLOBAL(strevc3,STREVC3) +void LAPACK_strevc3( + char const* side, char const* howmny, + lapack_logical* select, + lapack_int const* n, + float const* T, lapack_int const* ldt, + float* VL, lapack_int const* ldvl, + float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ztrevc3 LAPACK_GLOBAL(ztrevc3,ZTREVC3) +void LAPACK_ztrevc3( + char 
const* side, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* VL, lapack_int const* ldvl, + lapack_complex_double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* info ); + +#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc,CTREXC) +void LAPACK_ctrexc( + char const* compq, + lapack_int const* n, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* Q, lapack_int const* ldq, lapack_int const* ifst, lapack_int const* ilst, + lapack_int* info ); + +#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc,DTREXC) +void LAPACK_dtrexc( + char const* compq, + lapack_int const* n, + double* T, lapack_int const* ldt, + double* Q, lapack_int const* ldq, lapack_int* ifst, lapack_int* ilst, + double* work, + lapack_int* info ); + +#define LAPACK_strexc LAPACK_GLOBAL(strexc,STREXC) +void LAPACK_strexc( + char const* compq, + lapack_int const* n, + float* T, lapack_int const* ldt, + float* Q, lapack_int const* ldq, lapack_int* ifst, lapack_int* ilst, + float* work, + lapack_int* info ); + +#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc,ZTREXC) +void LAPACK_ztrexc( + char const* compq, + lapack_int const* n, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* Q, lapack_int const* ldq, lapack_int const* ifst, lapack_int const* ilst, + lapack_int* info ); + +#define LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs,CTRRFS) +void LAPACK_ctrrfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float const* X, lapack_int const* ldx, + float* ferr, + float* berr, + lapack_complex_float* work, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs,DTRRFS) +void LAPACK_dtrrfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double const* X, lapack_int const* ldx, + double* ferr, + double* berr, + double* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_strrfs LAPACK_GLOBAL(strrfs,STRRFS) +void LAPACK_strrfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float const* X, lapack_int const* ldx, + float* ferr, + float* berr, + float* work, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs,ZTRRFS) +void LAPACK_ztrrfs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double const* X, lapack_int const* ldx, + double* ferr, + double* berr, + lapack_complex_double* work, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen,CTRSEN) +void LAPACK_ctrsen( + char const* job, char const* compq, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* W, lapack_int* m, + float* s, + float* sep, + lapack_complex_float* 
work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen,DTRSEN) +void LAPACK_dtrsen( + char const* job, char const* compq, + lapack_logical const* select, + lapack_int const* n, + double* T, lapack_int const* ldt, + double* Q, lapack_int const* ldq, + double* WR, + double* WI, lapack_int* m, + double* s, + double* sep, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_strsen LAPACK_GLOBAL(strsen,STRSEN) +void LAPACK_strsen( + char const* job, char const* compq, + lapack_logical const* select, + lapack_int const* n, + float* T, lapack_int const* ldt, + float* Q, lapack_int const* ldq, + float* WR, + float* WI, lapack_int* m, + float* s, + float* sep, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen,ZTRSEN) +void LAPACK_ztrsen( + char const* job, char const* compq, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* Q, lapack_int const* ldq, + lapack_complex_double* W, lapack_int* m, + double* s, + double* sep, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna,CTRSNA) +void LAPACK_ctrsna( + char const* job, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_float const* T, lapack_int const* ldt, + lapack_complex_float const* VL, lapack_int const* ldvl, + lapack_complex_float const* VR, lapack_int const* ldvr, + float* S, + float* SEP, lapack_int const* mm, lapack_int* m, + lapack_complex_float* work, lapack_int const* ldwork, + float* rwork, + lapack_int* info ); + +#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna,DTRSNA) +void LAPACK_dtrsna( + char const* job, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + double const* T, lapack_int const* ldt, + double const* VL, lapack_int const* ldvl, + double const* VR, lapack_int const* ldvr, + double* S, + double* SEP, lapack_int const* mm, lapack_int* m, + double* work, lapack_int const* ldwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_strsna LAPACK_GLOBAL(strsna,STRSNA) +void LAPACK_strsna( + char const* job, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + float const* T, lapack_int const* ldt, + float const* VL, lapack_int const* ldvl, + float const* VR, lapack_int const* ldvr, + float* S, + float* SEP, lapack_int const* mm, lapack_int* m, + float* work, lapack_int const* ldwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna,ZTRSNA) +void LAPACK_ztrsna( + char const* job, char const* howmny, + lapack_logical const* select, + lapack_int const* n, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double const* VL, lapack_int const* ldvl, + lapack_complex_double const* VR, lapack_int const* ldvr, + double* S, + double* SEP, lapack_int const* mm, lapack_int* m, + lapack_complex_double* work, lapack_int const* ldwork, + double* rwork, + lapack_int* info ); + +#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl,CTRSYL) +void LAPACK_ctrsyl( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, + 
float* scale, + lapack_int* info ); + +#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl,DTRSYL) +void LAPACK_dtrsyl( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, + double* scale, + lapack_int* info ); + +#define LAPACK_strsyl LAPACK_GLOBAL(strsyl,STRSYL) +void LAPACK_strsyl( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, + float* scale, + lapack_int* info ); + +#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl,ZTRSYL) +void LAPACK_ztrsyl( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, + double* scale, + lapack_int* info ); + +#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri,CTRTRI) +void LAPACK_ctrtri( + char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri,DTRTRI) +void LAPACK_dtrtri( + char const* uplo, char const* diag, + lapack_int const* n, + double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_strtri LAPACK_GLOBAL(strtri,STRTRI) +void LAPACK_strtri( + char const* uplo, char const* diag, + lapack_int const* n, + float* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri,ZTRTRI) +void LAPACK_ztrtri( + char const* uplo, char const* diag, + lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_int* info ); + +#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs,CTRTRS) +void LAPACK_ctrtrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs,DTRTRS) +void LAPACK_dtrtrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + double const* A, lapack_int const* lda, + double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_strtrs LAPACK_GLOBAL(strtrs,STRTRS) +void LAPACK_strtrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + float const* A, lapack_int const* lda, + float* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs,ZTRTRS) +void LAPACK_ztrtrs( + char const* uplo, char const* trans, char const* diag, + lapack_int const* n, lapack_int const* nrhs, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* B, lapack_int const* ldb, + lapack_int* info ); + +#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf,CTRTTF) +void LAPACK_ctrttf( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* ARF, + lapack_int* info ); + +#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf,DTRTTF) +void LAPACK_dtrttf( + char const* transr, char const* uplo, + lapack_int const* n, + double const* A, lapack_int const* lda, + double* ARF, + lapack_int* info ); + +#define 
LAPACK_strttf LAPACK_GLOBAL(strttf,STRTTF) +void LAPACK_strttf( + char const* transr, char const* uplo, + lapack_int const* n, + float const* A, lapack_int const* lda, + float* ARF, + lapack_int* info ); + +#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf,ZTRTTF) +void LAPACK_ztrttf( + char const* transr, char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* ARF, + lapack_int* info ); + +#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp,CTRTTP) +void LAPACK_ctrttp( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float* AP, + lapack_int* info ); + +#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp,DTRTTP) +void LAPACK_dtrttp( + char const* uplo, + lapack_int const* n, + double const* A, lapack_int const* lda, + double* AP, + lapack_int* info ); + +#define LAPACK_strttp LAPACK_GLOBAL(strttp,STRTTP) +void LAPACK_strttp( + char const* uplo, + lapack_int const* n, + float const* A, lapack_int const* lda, + float* AP, + lapack_int* info ); + +#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp,ZTRTTP) +void LAPACK_ztrttp( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double* AP, + lapack_int* info ); + +#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf,CTZRZF) +void LAPACK_ctzrzf( + lapack_int const* m, lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf,DTZRZF) +void LAPACK_dtzrzf( + lapack_int const* m, lapack_int const* n, + double* A, lapack_int const* lda, + double* tau, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf,STZRZF) +void LAPACK_stzrzf( + lapack_int const* m, lapack_int const* n, + float* A, lapack_int const* lda, + float* tau, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf,ZTZRZF) +void LAPACK_ztzrzf( + lapack_int const* m, lapack_int const* n, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb,CUNBDB) +void LAPACK_cunbdb( + char const* trans, char const* signs, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + lapack_complex_float* X11, lapack_int const* ldx11, + lapack_complex_float* X12, lapack_int const* ldx12, + lapack_complex_float* X21, lapack_int const* ldx21, + lapack_complex_float* X22, lapack_int const* ldx22, + float* theta, + float* phi, + lapack_complex_float* TAUP1, + lapack_complex_float* TAUP2, + lapack_complex_float* TAUQ1, + lapack_complex_float* TAUQ2, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb,ZUNBDB) +void LAPACK_zunbdb( + char const* trans, char const* signs, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + lapack_complex_double* X11, lapack_int const* ldx11, + lapack_complex_double* X12, lapack_int const* ldx12, + lapack_complex_double* X21, lapack_int const* ldx21, + lapack_complex_double* X22, lapack_int const* ldx22, + double* theta, + double* phi, + lapack_complex_double* TAUP1, + lapack_complex_double* TAUP2, + lapack_complex_double* TAUQ1, + lapack_complex_double* TAUQ2, + lapack_complex_double* work, lapack_int const* lwork, + 
lapack_int* info ); + +#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd,CUNCSD) +void LAPACK_cuncsd( + char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, char const* signs, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + lapack_complex_float* X11, lapack_int const* ldx11, + lapack_complex_float* X12, lapack_int const* ldx12, + lapack_complex_float* X21, lapack_int const* ldx21, + lapack_complex_float* X22, lapack_int const* ldx22, + float* theta, + lapack_complex_float* U1, lapack_int const* ldu1, + lapack_complex_float* U2, lapack_int const* ldu2, + lapack_complex_float* V1T, lapack_int const* ldv1t, + lapack_complex_float* V2T, lapack_int const* ldv2t, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd,ZUNCSD) +void LAPACK_zuncsd( + char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, char const* signs, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + lapack_complex_double* X11, lapack_int const* ldx11, + lapack_complex_double* X12, lapack_int const* ldx12, + lapack_complex_double* X21, lapack_int const* ldx21, + lapack_complex_double* X22, lapack_int const* ldx22, + double* theta, + lapack_complex_double* U1, lapack_int const* ldu1, + lapack_complex_double* U2, lapack_int const* ldu2, + lapack_complex_double* V1T, lapack_int const* ldv1t, + lapack_complex_double* V2T, lapack_int const* ldv2t, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_cuncsd2by1 LAPACK_GLOBAL(cuncsd2by1,CUNCSD2BY1) +void LAPACK_cuncsd2by1( + char const* jobu1, char const* jobu2, char const* jobv1t, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + lapack_complex_float* X11, lapack_int const* ldx11, + lapack_complex_float* X21, lapack_int const* ldx21, + float* theta, + lapack_complex_float* U1, lapack_int const* ldu1, + lapack_complex_float* U2, lapack_int const* ldu2, + lapack_complex_float* V1T, lapack_int const* ldv1t, + lapack_complex_float* work, lapack_int const* lwork, + float* rwork, lapack_int const* lrwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_zuncsd2by1 LAPACK_GLOBAL(zuncsd2by1,ZUNCSD2BY1) +void LAPACK_zuncsd2by1( + char const* jobu1, char const* jobu2, char const* jobv1t, + lapack_int const* m, lapack_int const* p, lapack_int const* q, + lapack_complex_double* X11, lapack_int const* ldx11, + lapack_complex_double* X21, lapack_int const* ldx21, + double* theta, + lapack_complex_double* U1, lapack_int const* ldu1, + lapack_complex_double* U2, lapack_int const* ldu2, + lapack_complex_double* V1T, lapack_int const* ldv1t, + lapack_complex_double* work, lapack_int const* lwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, + lapack_int* info ); + +#define LAPACK_cungbr LAPACK_GLOBAL(cungbr,CUNGBR) +void LAPACK_cungbr( + char const* vect, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zungbr LAPACK_GLOBAL(zungbr,ZUNGBR) +void LAPACK_zungbr( + char const* vect, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double* A, lapack_int const* lda, + 
lapack_complex_double const* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunghr LAPACK_GLOBAL(cunghr,CUNGHR) +void LAPACK_cunghr( + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunghr LAPACK_GLOBAL(zunghr,ZUNGHR) +void LAPACK_zunghr( + lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunglq LAPACK_GLOBAL(cunglq,CUNGLQ) +void LAPACK_cunglq( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunglq LAPACK_GLOBAL(zunglq,ZUNGLQ) +void LAPACK_zunglq( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cungql LAPACK_GLOBAL(cungql,CUNGQL) +void LAPACK_cungql( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zungql LAPACK_GLOBAL(zungql,ZUNGQL) +void LAPACK_zungql( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cungqr LAPACK_GLOBAL(cungqr,CUNGQR) +void LAPACK_cungqr( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zungqr LAPACK_GLOBAL(zungqr,ZUNGQR) +void LAPACK_zungqr( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cungrq LAPACK_GLOBAL(cungrq,CUNGRQ) +void LAPACK_cungrq( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zungrq LAPACK_GLOBAL(zungrq,ZUNGRQ) +void LAPACK_zungrq( + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cungtr LAPACK_GLOBAL(cungtr,CUNGTR) +void LAPACK_cungtr( + char const* uplo, + lapack_int const* n, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zungtr LAPACK_GLOBAL(zungtr,ZUNGTR) +void LAPACK_zungtr( + char const* uplo, + lapack_int const* n, + 
lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR) +void LAPACK_cunmbr( + char const* vect, char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr,ZUNMBR) +void LAPACK_zunmbr( + char const* vect, char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr,CUNMHR) +void LAPACK_cunmhr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr,ZUNMHR) +void LAPACK_zunmhr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq,CUNMLQ) +void LAPACK_cunmlq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq,ZUNMLQ) +void LAPACK_zunmlq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunmql LAPACK_GLOBAL(cunmql,CUNMQL) +void LAPACK_cunmql( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunmql LAPACK_GLOBAL(zunmql,ZUNMQL) +void LAPACK_zunmql( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr,CUNMQR) +void LAPACK_cunmqr( + char const* side, char const* trans, 
+ lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr,ZUNMQR) +void LAPACK_zunmqr( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq,CUNMRQ) +void LAPACK_cunmrq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq,ZUNMRQ) +void LAPACK_zunmrq( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz,CUNMRZ) +void LAPACK_cunmrz( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz,ZUNMRZ) +void LAPACK_zunmrz( + char const* side, char const* trans, + lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr,CUNMTR) +void LAPACK_cunmtr( + char const* side, char const* uplo, char const* trans, + lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* tau, + lapack_complex_float* C, lapack_int const* ldc, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr,ZUNMTR) +void LAPACK_zunmtr( + char const* side, char const* uplo, char const* trans, + lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* tau, + lapack_complex_double* C, lapack_int const* ldc, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr,CUPGTR) +void LAPACK_cupgtr( + char const* uplo, + lapack_int const* n, + lapack_complex_float const* AP, + lapack_complex_float const* tau, + lapack_complex_float* Q, lapack_int const* ldq, + lapack_complex_float* work, + lapack_int* info ); + +#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr,ZUPGTR) +void LAPACK_zupgtr( + char const* uplo, + lapack_int const* n, + lapack_complex_double const* 
AP,
+    lapack_complex_double const* tau,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr,CUPMTR)
+void LAPACK_cupmtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* AP,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr,ZUPMTR)
+void LAPACK_zupmtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* AP,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* LAPACK_H */
diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index c5ea465e0..6eb0b696b 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -34,81 +34,7 @@
 #ifndef _LAPACKE_H_
 #define _LAPACKE_H_
-/*
-* Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
-*/
-#ifdef HAVE_LAPACK_CONFIG_H
-#include "lapacke_config.h"
-#endif
-
-#include <stdlib.h>
-
-#ifndef lapack_int
-#define lapack_int int
-#endif
-
-#ifndef lapack_logical
-#define lapack_logical lapack_int
-#endif
-
-/* Complex types are structures equivalent to the
-* Fortran complex types COMPLEX(4) and COMPLEX(8).
-*
-* One can also redefine the types with his own types
-* for example by including in the code definitions like
-*
-* #define lapack_complex_float std::complex<float>
-* #define lapack_complex_double std::complex<double>
-*
-* or define these types in the command line:
-*
-* -Dlapack_complex_float="std::complex<float>"
-* -Dlapack_complex_double="std::complex<double>"
-*/
-
-#ifndef LAPACK_COMPLEX_CUSTOM
-
-/* Complex type (single precision) */
-#ifndef lapack_complex_float
-#ifndef __cplusplus
-#include <complex.h>
-#else
-#include <complex>
-#endif
-#define lapack_complex_float float _Complex
-#endif
-
-#ifndef lapack_complex_float_real
-#define lapack_complex_float_real(z) (creal(z))
-#endif
-
-#ifndef lapack_complex_float_imag
-#define lapack_complex_float_imag(z) (cimag(z))
-#endif
-
-lapack_complex_float lapack_make_complex_float( float re, float im );
-
-/* Complex type (double precision) */
-#ifndef lapack_complex_double
-#ifndef __cplusplus
-#include <complex.h>
-#else
-#include <complex>
-#endif
-#define lapack_complex_double double _Complex
-#endif
-
-#ifndef lapack_complex_double_real
-#define lapack_complex_double_real(z) (creal(z))
-#endif
-
-#ifndef lapack_complex_double_imag
-#define lapack_complex_double_imag(z) (cimag(z))
-#endif
-
-lapack_complex_double lapack_make_complex_double( double re, double im );
-
-#endif
+#include "lapack.h"
 #ifdef __cplusplus
 extern "C" {
@@ -130,29 +56,8 @@ extern "C" {
 #define LAPACK_WORK_MEMORY_ERROR -1010
 #define LAPACK_TRANSPOSE_MEMORY_ERROR -1011
-/* Callback logical functions of one, two, or three arguments are used
-* to select eigenvalues to sort to the top left of the Schur form.
-* The value is selected if function returns TRUE (non-zero). 
*/ - -typedef lapack_logical (*LAPACK_S_SELECT2) ( const float*, const float* ); -typedef lapack_logical (*LAPACK_S_SELECT3) - ( const float*, const float*, const float* ); -typedef lapack_logical (*LAPACK_D_SELECT2) ( const double*, const double* ); -typedef lapack_logical (*LAPACK_D_SELECT3) - ( const double*, const double*, const double* ); - -typedef lapack_logical (*LAPACK_C_SELECT1) ( const lapack_complex_float* ); -typedef lapack_logical (*LAPACK_C_SELECT2) - ( const lapack_complex_float*, const lapack_complex_float* ); -typedef lapack_logical (*LAPACK_Z_SELECT1) ( const lapack_complex_double* ); -typedef lapack_logical (*LAPACK_Z_SELECT2) - ( const lapack_complex_double*, const lapack_complex_double* ); - -#include "lapacke_mangling.h" - -#define LAPACK_lsame LAPACK_GLOBAL(lsame,LSAME) -lapack_logical LAPACK_lsame( char* ca, char* cb, - lapack_int lca, lapack_int lcb ); +lapack_complex_float lapack_make_complex_float( float re, float im ); +lapack_complex_double lapack_make_complex_double( double re, double im ); /* C-LAPACK function prototypes */ @@ -1034,6 +939,25 @@ lapack_int LAPACKE_zgesvdx( int matrix_layout, char jobu, char jobvt, char range lapack_complex_double* vt, lapack_int ldvt, lapack_int* superb ); +lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, float* a, lapack_int lda, + float* s, float* u, lapack_int ldu, float* v, + lapack_int ldv, lapack_int* numrank ); +lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, double* a, + lapack_int lda, double* s, double* u, lapack_int ldu, + double* v, lapack_int ldv, lapack_int* numrank); +lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, lapack_complex_float* a, + lapack_int lda, float* s, lapack_complex_float* u, + lapack_int ldu, lapack_complex_float* v, + lapack_int ldv, lapack_int* numrank ); +lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, lapack_complex_double* a, + lapack_int lda, double* s, lapack_complex_double* u, + lapack_int ldu, lapack_complex_double* v, + lapack_int ldv, lapack_int* numrank ); + lapack_int LAPACKE_sgesvj( int matrix_layout, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, lapack_int mv, float* v, lapack_int ldv, @@ -1943,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo, lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, const lapack_complex_float* b, + lapack_int lda, lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, const lapack_complex_double* b, + lapack_int lda, lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, @@ -5824,6 +5748,45 @@ lapack_int LAPACKE_zgesvdx_work( int matrix_layout, char jobu, char jobvt, char lapack_complex_double* work, lapack_int lwork, double* rwork, lapack_int* iwork ); +lapack_int LAPACKE_sgesvdq_work( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, float* a, + lapack_int lda, float* s, float* u, + lapack_int ldu, float* v, 
lapack_int ldv, + lapack_int* numrank, + lapack_int* iwork, lapack_int liwork, + float* work, lapack_int lwork, + float* rwork, lapack_int lrwork); +lapack_int LAPACKE_dgesvdq_work( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, double* a, + lapack_int lda, double* s, double* u, + lapack_int ldu, double* v, lapack_int ldv, + lapack_int* numrank, + lapack_int* iwork, lapack_int liwork, + double* work, lapack_int lwork, + double* rwork, lapack_int lrwork); +lapack_int LAPACKE_cgesvdq_work( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, + lapack_complex_float* a, lapack_int lda, + float* s, lapack_complex_float* u, + lapack_int ldu, lapack_complex_float* v, + lapack_int ldv, lapack_int* numrank, + lapack_int* iwork, lapack_int liwork, + lapack_complex_float* cwork, lapack_int lcwork, + float* rwork, lapack_int lrwork); +lapack_int LAPACKE_zgesvdq_work( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, + lapack_complex_double* a, lapack_int lda, + double* s, lapack_complex_double* u, + lapack_int ldu, lapack_complex_double* v, + lapack_int ldv, lapack_int* numrank, + lapack_int* iwork, lapack_int liwork, + lapack_complex_double* cwork, lapack_int lcwork, + double* rwork, lapack_int lrwork); + lapack_int LAPACKE_sgesvj_work( int matrix_layout, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, lapack_int mv, @@ -6969,11 +6932,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range, lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, const lapack_complex_float* b, + lapack_int lda, lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, const lapack_complex_double* b, + lapack_int lda, lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz, @@ -10590,11 +10553,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb ); lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const lapack_complex_float* a, + lapack_int nrhs, lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const lapack_complex_float* a, + lapack_int nrhs, lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ); @@ -10755,10 +10718,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, double* work, lapack_int nb ); lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const double* a, lapack_int lda, + lapack_int nrhs, double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ); lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const double* a, + lapack_int nrhs, double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ); lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, 
char jobu2, @@ -10850,10 +10813,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, float* work, lapack_int nb ); lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const float* a, lapack_int lda, + lapack_int nrhs, float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ); lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const float* a, + lapack_int nrhs, float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ); lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10935,11 +10898,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb ); lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const lapack_complex_double* a, + lapack_int nrhs, lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const lapack_complex_double* a, + lapack_int nrhs, lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ); @@ -12609,6848 +12572,6 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i lapack_int lda, lapack_complex_double* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, lapack_complex_double* b, lapack_int ldb ); - -#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF) -#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf,DGETRF) -#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf,CGETRF) -#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf,ZGETRF) -#define LAPACK_sgetrf2 LAPACK_GLOBAL(sgetrf2,SGETRF2) -#define LAPACK_dgetrf2 LAPACK_GLOBAL(dgetrf2,DGETRF2) -#define LAPACK_cgetrf2 LAPACK_GLOBAL(cgetrf2,CGETRF2) -#define LAPACK_zgetrf2 LAPACK_GLOBAL(zgetrf2,ZGETRF2) -#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf,SGBTRF) -#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf,DGBTRF) -#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf,CGBTRF) -#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf,ZGBTRF) -#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf,SGTTRF) -#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf,DGTTRF) -#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf,CGTTRF) -#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf,ZGTTRF) -#define LAPACK_spotrf LAPACK_GLOBAL(spotrf,SPOTRF) -#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf,DPOTRF) -#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf,CPOTRF) -#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf,ZPOTRF) -#define LAPACK_spotrf2 LAPACK_GLOBAL(spotrf2,SPOTRF2) -#define LAPACK_dpotrf2 LAPACK_GLOBAL(dpotrf2,DPOTRF2) -#define LAPACK_cpotrf2 LAPACK_GLOBAL(cpotrf2,CPOTRF2) -#define LAPACK_zpotrf2 LAPACK_GLOBAL(zpotrf2,ZPOTRF2) -#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf,DPSTRF) -#define LAPACK_spstrf LAPACK_GLOBAL(spstrf,SPSTRF) -#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf,ZPSTRF) -#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf,CPSTRF) -#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf,DPFTRF) -#define LAPACK_spftrf LAPACK_GLOBAL(spftrf,SPFTRF) -#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf,ZPFTRF) -#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf,CPFTRF) -#define LAPACK_spptrf LAPACK_GLOBAL(spptrf,SPPTRF) -#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf,DPPTRF) -#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf,CPPTRF) -#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf,ZPPTRF) -#define 
LAPACK_spbtrf LAPACK_GLOBAL(spbtrf,SPBTRF) -#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf,DPBTRF) -#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf,CPBTRF) -#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf,ZPBTRF) -#define LAPACK_spttrf LAPACK_GLOBAL(spttrf,SPTTRF) -#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf,DPTTRF) -#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf,CPTTRF) -#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf,ZPTTRF) -#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf,SSYTRF) -#define LAPACK_ssytrf_rook LAPACK_GLOBAL(ssytrf_rook,SSYTRF_ROOK) -#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf,DSYTRF) -#define LAPACK_dsytrf_rook LAPACK_GLOBAL(dsytrf_rook,DSYTRF_ROOK) -#define LAPACK_csytrf LAPACK_GLOBAL(csytrf,CSYTRF) -#define LAPACK_csytrf_rook LAPACK_GLOBAL(csytrf_rook,CSYTRF_ROOK) -#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf,ZSYTRF) -#define LAPACK_zsytrf_rook LAPACK_GLOBAL(zsytrf_rook,ZSYTRF_ROOK) -#define LAPACK_chetrf LAPACK_GLOBAL(chetrf,CHETRF) -#define LAPACK_chetrf_rook LAPACK_GLOBAL(chetrf_rook,CHETRF_ROOK) -#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf,ZHETRF) -#define LAPACK_zhetrf_rook LAPACK_GLOBAL(zhetrf_rook,ZHETRF_ROOK) -#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf,SSPTRF) -#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf,DSPTRF) -#define LAPACK_csptrf LAPACK_GLOBAL(csptrf,CSPTRF) -#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf,ZSPTRF) -#define LAPACK_chptrf LAPACK_GLOBAL(chptrf,CHPTRF) -#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf,ZHPTRF) -#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs,SGETRS) -#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs,DGETRS) -#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs,CGETRS) -#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs,ZGETRS) -#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs,SGBTRS) -#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs,DGBTRS) -#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs,CGBTRS) -#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs,ZGBTRS) -#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs,SGTTRS) -#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs,DGTTRS) -#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs,CGTTRS) -#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs,ZGTTRS) -#define LAPACK_spotrs LAPACK_GLOBAL(spotrs,SPOTRS) -#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs,DPOTRS) -#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs,CPOTRS) -#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs,ZPOTRS) -#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs,DPFTRS) -#define LAPACK_spftrs LAPACK_GLOBAL(spftrs,SPFTRS) -#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs,ZPFTRS) -#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs,CPFTRS) -#define LAPACK_spptrs LAPACK_GLOBAL(spptrs,SPPTRS) -#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs,DPPTRS) -#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs,CPPTRS) -#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs,ZPPTRS) -#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs,SPBTRS) -#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs,DPBTRS) -#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs,CPBTRS) -#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs,ZPBTRS) -#define LAPACK_spttrs LAPACK_GLOBAL(spttrs,SPTTRS) -#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs,DPTTRS) -#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs,CPTTRS) -#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs,ZPTTRS) -#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs,SSYTRS) -#define LAPACK_ssytrs_rook LAPACK_GLOBAL(ssytrs_rook,SSYTRS_ROOK) -#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs,DSYTRS) -#define LAPACK_dsytrs_rook LAPACK_GLOBAL(dsytrs_rook,DSYTRS_ROOK) -#define LAPACK_csytrs LAPACK_GLOBAL(csytrs,CSYTRS) -#define LAPACK_csytrs_rook LAPACK_GLOBAL(csytrs_rook,CSYTRS_ROOK) -#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs,ZSYTRS) -#define LAPACK_zsytrs_rook 
LAPACK_GLOBAL(zsytrs_rook,ZSYTRS_ROOK) -#define LAPACK_chetrs LAPACK_GLOBAL(chetrs,CHETRS) -#define LAPACK_chetrs_rook LAPACK_GLOBAL(chetrs_rook,CHETRS_ROOK) -#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs,ZHETRS) -#define LAPACK_zhetrs_rook LAPACK_GLOBAL(zhetrs_rook,ZHETRS_ROOK) -#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs,SSPTRS) -#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs,DSPTRS) -#define LAPACK_csptrs LAPACK_GLOBAL(csptrs,CSPTRS) -#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs,ZSPTRS) -#define LAPACK_chptrs LAPACK_GLOBAL(chptrs,CHPTRS) -#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs,ZHPTRS) -#define LAPACK_strtrs LAPACK_GLOBAL(strtrs,STRTRS) -#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs,DTRTRS) -#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs,CTRTRS) -#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs,ZTRTRS) -#define LAPACK_stptrs LAPACK_GLOBAL(stptrs,STPTRS) -#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs,DTPTRS) -#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs,CTPTRS) -#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs,ZTPTRS) -#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs,STBTRS) -#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs,DTBTRS) -#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs,CTBTRS) -#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs,ZTBTRS) -#define LAPACK_sgecon LAPACK_GLOBAL(sgecon,SGECON) -#define LAPACK_dgecon LAPACK_GLOBAL(dgecon,DGECON) -#define LAPACK_cgecon LAPACK_GLOBAL(cgecon,CGECON) -#define LAPACK_zgecon LAPACK_GLOBAL(zgecon,ZGECON) -#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon,SGBCON) -#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon,DGBCON) -#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon,CGBCON) -#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon,ZGBCON) -#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon,SGTCON) -#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon,DGTCON) -#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon,CGTCON) -#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon,ZGTCON) -#define LAPACK_spocon LAPACK_GLOBAL(spocon,SPOCON) -#define LAPACK_dpocon LAPACK_GLOBAL(dpocon,DPOCON) -#define LAPACK_cpocon LAPACK_GLOBAL(cpocon,CPOCON) -#define LAPACK_zpocon LAPACK_GLOBAL(zpocon,ZPOCON) -#define LAPACK_sppcon LAPACK_GLOBAL(sppcon,SPPCON) -#define LAPACK_dppcon LAPACK_GLOBAL(dppcon,DPPCON) -#define LAPACK_cppcon LAPACK_GLOBAL(cppcon,CPPCON) -#define LAPACK_zppcon LAPACK_GLOBAL(zppcon,ZPPCON) -#define LAPACK_spbcon LAPACK_GLOBAL(spbcon,SPBCON) -#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon,DPBCON) -#define LAPACK_cpbcon LAPACK_GLOBAL(cpbcon,CPBCON) -#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon,ZPBCON) -#define LAPACK_sptcon LAPACK_GLOBAL(sptcon,SPTCON) -#define LAPACK_dptcon LAPACK_GLOBAL(dptcon,DPTCON) -#define LAPACK_cptcon LAPACK_GLOBAL(cptcon,CPTCON) -#define LAPACK_zptcon LAPACK_GLOBAL(zptcon,ZPTCON) -#define LAPACK_ssycon LAPACK_GLOBAL(ssycon,SSYCON) -#define LAPACK_dsycon LAPACK_GLOBAL(dsycon,DSYCON) -#define LAPACK_csycon LAPACK_GLOBAL(csycon,CSYCON) -#define LAPACK_zsycon LAPACK_GLOBAL(zsycon,ZSYCON) -#define LAPACK_checon LAPACK_GLOBAL(checon,CHECON) -#define LAPACK_zhecon LAPACK_GLOBAL(zhecon,ZHECON) -#define LAPACK_sspcon LAPACK_GLOBAL(sspcon,SSPCON) -#define LAPACK_dspcon LAPACK_GLOBAL(dspcon,DSPCON) -#define LAPACK_cspcon LAPACK_GLOBAL(cspcon,CSPCON) -#define LAPACK_zspcon LAPACK_GLOBAL(zspcon,ZSPCON) -#define LAPACK_chpcon LAPACK_GLOBAL(chpcon,CHPCON) -#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon,ZHPCON) -#define LAPACK_strcon LAPACK_GLOBAL(strcon,STRCON) -#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon,DTRCON) -#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon,CTRCON) -#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon,ZTRCON) -#define LAPACK_stpcon LAPACK_GLOBAL(stpcon,STPCON) 
-#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon,DTPCON) -#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon,CTPCON) -#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon,ZTPCON) -#define LAPACK_stbcon LAPACK_GLOBAL(stbcon,STBCON) -#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon,DTBCON) -#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon,CTBCON) -#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon,ZTBCON) -#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs,SGERFS) -#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs,DGERFS) -#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs,CGERFS) -#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs,ZGERFS) -#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx,DGERFSX) -#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx,SGERFSX) -#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx,ZGERFSX) -#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx,CGERFSX) -#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs,SGBRFS) -#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs,DGBRFS) -#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs,CGBRFS) -#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs,ZGBRFS) -#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx,DGBRFSX) -#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx,SGBRFSX) -#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx,ZGBRFSX) -#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx,CGBRFSX) -#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs,SGTRFS) -#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs,DGTRFS) -#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs,CGTRFS) -#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs,ZGTRFS) -#define LAPACK_sporfs LAPACK_GLOBAL(sporfs,SPORFS) -#define LAPACK_dporfs LAPACK_GLOBAL(dporfs,DPORFS) -#define LAPACK_cporfs LAPACK_GLOBAL(cporfs,CPORFS) -#define LAPACK_zporfs LAPACK_GLOBAL(zporfs,ZPORFS) -#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx,DPORFSX) -#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx,SPORFSX) -#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx,ZPORFSX) -#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx,CPORFSX) -#define LAPACK_spprfs LAPACK_GLOBAL(spprfs,SPPRFS) -#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs,DPPRFS) -#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs,CPPRFS) -#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs,ZPPRFS) -#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs,SPBRFS) -#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs,DPBRFS) -#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs,CPBRFS) -#define LAPACK_zpbrfs LAPACK_GLOBAL(zpbrfs,ZPBRFS) -#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs,SPTRFS) -#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs,DPTRFS) -#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs,CPTRFS) -#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs,ZPTRFS) -#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs,SSYRFS) -#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs,DSYRFS) -#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs,CSYRFS) -#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs,ZSYRFS) -#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx,DSYRFSX) -#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx,SSYRFSX) -#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx,ZSYRFSX) -#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx,CSYRFSX) -#define LAPACK_cherfs LAPACK_GLOBAL(cherfs,CHERFS) -#define LAPACK_zherfs LAPACK_GLOBAL(zherfs,ZHERFS) -#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx,ZHERFSX) -#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx,CHERFSX) -#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs,SSPRFS) -#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs,DSPRFS) -#define LAPACK_csprfs LAPACK_GLOBAL(csprfs,CSPRFS) -#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs,ZSPRFS) -#define LAPACK_chprfs LAPACK_GLOBAL(chprfs,CHPRFS) -#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs,ZHPRFS) -#define LAPACK_strrfs LAPACK_GLOBAL(strrfs,STRRFS) -#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs,DTRRFS) -#define 
LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs,CTRRFS) -#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs,ZTRRFS) -#define LAPACK_stprfs LAPACK_GLOBAL(stprfs,STPRFS) -#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs,DTPRFS) -#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs,CTPRFS) -#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs,ZTPRFS) -#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs,STBRFS) -#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs,DTBRFS) -#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs,CTBRFS) -#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs,ZTBRFS) -#define LAPACK_sgetri LAPACK_GLOBAL(sgetri,SGETRI) -#define LAPACK_dgetri LAPACK_GLOBAL(dgetri,DGETRI) -#define LAPACK_cgetri LAPACK_GLOBAL(cgetri,CGETRI) -#define LAPACK_zgetri LAPACK_GLOBAL(zgetri,ZGETRI) -#define LAPACK_spotri LAPACK_GLOBAL(spotri,SPOTRI) -#define LAPACK_dpotri LAPACK_GLOBAL(dpotri,DPOTRI) -#define LAPACK_cpotri LAPACK_GLOBAL(cpotri,CPOTRI) -#define LAPACK_zpotri LAPACK_GLOBAL(zpotri,ZPOTRI) -#define LAPACK_dpftri LAPACK_GLOBAL(dpftri,DPFTRI) -#define LAPACK_spftri LAPACK_GLOBAL(spftri,SPFTRI) -#define LAPACK_zpftri LAPACK_GLOBAL(zpftri,ZPFTRI) -#define LAPACK_cpftri LAPACK_GLOBAL(cpftri,CPFTRI) -#define LAPACK_spptri LAPACK_GLOBAL(spptri,SPPTRI) -#define LAPACK_dpptri LAPACK_GLOBAL(dpptri,DPPTRI) -#define LAPACK_cpptri LAPACK_GLOBAL(cpptri,CPPTRI) -#define LAPACK_zpptri LAPACK_GLOBAL(zpptri,ZPPTRI) -#define LAPACK_ssytri LAPACK_GLOBAL(ssytri,SSYTRI) -#define LAPACK_dsytri LAPACK_GLOBAL(dsytri,DSYTRI) -#define LAPACK_csytri LAPACK_GLOBAL(csytri,CSYTRI) -#define LAPACK_zsytri LAPACK_GLOBAL(zsytri,ZSYTRI) -#define LAPACK_chetri LAPACK_GLOBAL(chetri,CHETRI) -#define LAPACK_zhetri LAPACK_GLOBAL(zhetri,ZHETRI) -#define LAPACK_ssptri LAPACK_GLOBAL(ssptri,SSPTRI) -#define LAPACK_dsptri LAPACK_GLOBAL(dsptri,DSPTRI) -#define LAPACK_csptri LAPACK_GLOBAL(csptri,CSPTRI) -#define LAPACK_zsptri LAPACK_GLOBAL(zsptri,ZSPTRI) -#define LAPACK_chptri LAPACK_GLOBAL(chptri,CHPTRI) -#define LAPACK_zhptri LAPACK_GLOBAL(zhptri,ZHPTRI) -#define LAPACK_strtri LAPACK_GLOBAL(strtri,STRTRI) -#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri,DTRTRI) -#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri,CTRTRI) -#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri,ZTRTRI) -#define LAPACK_dtftri LAPACK_GLOBAL(dtftri,DTFTRI) -#define LAPACK_stftri LAPACK_GLOBAL(stftri,STFTRI) -#define LAPACK_ztftri LAPACK_GLOBAL(ztftri,ZTFTRI) -#define LAPACK_ctftri LAPACK_GLOBAL(ctftri,CTFTRI) -#define LAPACK_stptri LAPACK_GLOBAL(stptri,STPTRI) -#define LAPACK_dtptri LAPACK_GLOBAL(dtptri,DTPTRI) -#define LAPACK_ctptri LAPACK_GLOBAL(ctptri,CTPTRI) -#define LAPACK_ztptri LAPACK_GLOBAL(ztptri,ZTPTRI) -#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ,SGEEQU) -#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ,DGEEQU) -#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ,CGEEQU) -#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ,ZGEEQU) -#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb,DGEEQUB) -#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb,SGEEQUB) -#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb,ZGEEQUB) -#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb,CGEEQUB) -#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ,SGBEQU) -#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ,DGBEQU) -#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ,CGBEQU) -#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ,ZGBEQU) -#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb,DGBEQUB) -#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb,SGBEQUB) -#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb,ZGBEQUB) -#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb,CGBEQUB) -#define LAPACK_spoequ LAPACK_GLOBAL(spoequ,SPOEQU) -#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ,DPOEQU) 
-#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ,CPOEQU) -#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ,ZPOEQU) -#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb,DPOEQUB) -#define LAPACK_spoequb LAPACK_GLOBAL(spoequb,SPOEQUB) -#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb,ZPOEQUB) -#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb,CPOEQUB) -#define LAPACK_sppequ LAPACK_GLOBAL(sppequ,SPPEQU) -#define LAPACK_dppequ LAPACK_GLOBAL(dppequ,DPPEQU) -#define LAPACK_cppequ LAPACK_GLOBAL(cppequ,CPPEQU) -#define LAPACK_zppequ LAPACK_GLOBAL(zppequ,ZPPEQU) -#define LAPACK_spbequ LAPACK_GLOBAL(spbequ,SPBEQU) -#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ,DPBEQU) -#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ,CPBEQU) -#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ,ZPBEQU) -#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb,DSYEQUB) -#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb,SSYEQUB) -#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb,ZSYEQUB) -#define LAPACK_csyequb LAPACK_GLOBAL(csyequb,CSYEQUB) -#define LAPACK_zheequb LAPACK_GLOBAL(zheequb,ZHEEQUB) -#define LAPACK_cheequb LAPACK_GLOBAL(cheequb,CHEEQUB) -#define LAPACK_sgesv LAPACK_GLOBAL(sgesv,SGESV) -#define LAPACK_dgesv LAPACK_GLOBAL(dgesv,DGESV) -#define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV) -#define LAPACK_zgesv LAPACK_GLOBAL(zgesv,ZGESV) -#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv,DSGESV) -#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv,ZCGESV) -#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx,SGESVX) -#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx,DGESVX) -#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx,CGESVX) -#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx,ZGESVX) -#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx,DGESVXX) -#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx,SGESVXX) -#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx,ZGESVXX) -#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx,CGESVXX) -#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv,SGBSV) -#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv,DGBSV) -#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv,CGBSV) -#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv,ZGBSV) -#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx,SGBSVX) -#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx,DGBSVX) -#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx,CGBSVX) -#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx,ZGBSVX) -#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx,DGBSVXX) -#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx,SGBSVXX) -#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx,ZGBSVXX) -#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx,CGBSVXX) -#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv,SGTSV) -#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv,DGTSV) -#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv,CGTSV) -#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv,ZGTSV) -#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx,SGTSVX) -#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx,DGTSVX) -#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx,CGTSVX) -#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx,ZGTSVX) -#define LAPACK_sposv LAPACK_GLOBAL(sposv,SPOSV) -#define LAPACK_dposv LAPACK_GLOBAL(dposv,DPOSV) -#define LAPACK_cposv LAPACK_GLOBAL(cposv,CPOSV) -#define LAPACK_zposv LAPACK_GLOBAL(zposv,ZPOSV) -#define LAPACK_dsposv LAPACK_GLOBAL(dsposv,DSPOSV) -#define LAPACK_zcposv LAPACK_GLOBAL(zcposv,ZCPOSV) -#define LAPACK_sposvx LAPACK_GLOBAL(sposvx,SPOSVX) -#define LAPACK_dposvx LAPACK_GLOBAL(dposvx,DPOSVX) -#define LAPACK_cposvx LAPACK_GLOBAL(cposvx,CPOSVX) -#define LAPACK_zposvx LAPACK_GLOBAL(zposvx,ZPOSVX) -#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx,DPOSVXX) -#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx,SPOSVXX) -#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx,ZPOSVXX) -#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx,CPOSVXX) 
-#define LAPACK_sppsv LAPACK_GLOBAL(sppsv,SPPSV) -#define LAPACK_dppsv LAPACK_GLOBAL(dppsv,DPPSV) -#define LAPACK_cppsv LAPACK_GLOBAL(cppsv,CPPSV) -#define LAPACK_zppsv LAPACK_GLOBAL(zppsv,ZPPSV) -#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx,SPPSVX) -#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx,DPPSVX) -#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx,CPPSVX) -#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx,ZPPSVX) -#define LAPACK_spbsv LAPACK_GLOBAL(spbsv,SPBSV) -#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv,DPBSV) -#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv,CPBSV) -#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv,ZPBSV) -#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx,SPBSVX) -#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx,DPBSVX) -#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx,CPBSVX) -#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx,ZPBSVX) -#define LAPACK_sptsv LAPACK_GLOBAL(sptsv,SPTSV) -#define LAPACK_dptsv LAPACK_GLOBAL(dptsv,DPTSV) -#define LAPACK_cptsv LAPACK_GLOBAL(cptsv,CPTSV) -#define LAPACK_zptsv LAPACK_GLOBAL(zptsv,ZPTSV) -#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx,SPTSVX) -#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx,DPTSVX) -#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx,CPTSVX) -#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx,ZPTSVX) -#define LAPACK_ssysv LAPACK_GLOBAL(ssysv,SSYSV) -#define LAPACK_dsysv LAPACK_GLOBAL(dsysv,DSYSV) -#define LAPACK_csysv LAPACK_GLOBAL(csysv,CSYSV) -#define LAPACK_zsysv LAPACK_GLOBAL(zsysv,ZSYSV) -#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx,SSYSVX) -#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx,DSYSVX) -#define LAPACK_csysvx LAPACK_GLOBAL(csysvx,CSYSVX) -#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx,ZSYSVX) -#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx,DSYSVXX) -#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx,SSYSVXX) -#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx,ZSYSVXX) -#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx,CSYSVXX) -#define LAPACK_chesv LAPACK_GLOBAL(chesv,CHESV) -#define LAPACK_zhesv LAPACK_GLOBAL(zhesv,ZHESV) -#define LAPACK_chesvx LAPACK_GLOBAL(chesvx,CHESVX) -#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx,ZHESVX) -#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx,ZHESVXX) -#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx,CHESVXX) -#define LAPACK_sspsv LAPACK_GLOBAL(sspsv,SSPSV) -#define LAPACK_dspsv LAPACK_GLOBAL(dspsv,DSPSV) -#define LAPACK_cspsv LAPACK_GLOBAL(cspsv,CSPSV) -#define LAPACK_zspsv LAPACK_GLOBAL(zspsv,ZSPSV) -#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx,SSPSVX) -#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx,DSPSVX) -#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx,CSPSVX) -#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx,ZSPSVX) -#define LAPACK_chpsv LAPACK_GLOBAL(chpsv,CHPSV) -#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv,ZHPSV) -#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx,CHPSVX) -#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx,ZHPSVX) -#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf,SGEQRF) -#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf,DGEQRF) -#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf,CGEQRF) -#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf,ZGEQRF) -#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf,SGEQPF) -#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf,DGEQPF) -#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf,CGEQPF) -#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf,ZGEQPF) -#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3,SGEQP3) -#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3,DGEQP3) -#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3,CGEQP3) -#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3,ZGEQP3) -#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr,SORGQR) -#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr,DORGQR) -#define LAPACK_sormqr LAPACK_GLOBAL(sormqr,SORMQR) -#define 
LAPACK_dormqr LAPACK_GLOBAL(dormqr,DORMQR) -#define LAPACK_cungqr LAPACK_GLOBAL(cungqr,CUNGQR) -#define LAPACK_zungqr LAPACK_GLOBAL(zungqr,ZUNGQR) -#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr,CUNMQR) -#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr,ZUNMQR) -#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf,SGELQF) -#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf,DGELQF) -#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf,CGELQF) -#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf,ZGELQF) -#define LAPACK_sorglq LAPACK_GLOBAL(sorglq,SORGLQ) -#define LAPACK_dorglq LAPACK_GLOBAL(dorglq,DORGLQ) -#define LAPACK_sormlq LAPACK_GLOBAL(sormlq,SORMLQ) -#define LAPACK_dormlq LAPACK_GLOBAL(dormlq,DORMLQ) -#define LAPACK_cunglq LAPACK_GLOBAL(cunglq,CUNGLQ) -#define LAPACK_zunglq LAPACK_GLOBAL(zunglq,ZUNGLQ) -#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq,CUNMLQ) -#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq,ZUNMLQ) -#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf,SGEQLF) -#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf,DGEQLF) -#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf,CGEQLF) -#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf,ZGEQLF) -#define LAPACK_sorgql LAPACK_GLOBAL(sorgql,SORGQL) -#define LAPACK_dorgql LAPACK_GLOBAL(dorgql,DORGQL) -#define LAPACK_cungql LAPACK_GLOBAL(cungql,CUNGQL) -#define LAPACK_zungql LAPACK_GLOBAL(zungql,ZUNGQL) -#define LAPACK_sormql LAPACK_GLOBAL(sormql,SORMQL) -#define LAPACK_dormql LAPACK_GLOBAL(dormql,DORMQL) -#define LAPACK_cunmql LAPACK_GLOBAL(cunmql,CUNMQL) -#define LAPACK_zunmql LAPACK_GLOBAL(zunmql,ZUNMQL) -#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf,SGERQF) -#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf,DGERQF) -#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf,CGERQF) -#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf,ZGERQF) -#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq,SORGRQ) -#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq,DORGRQ) -#define LAPACK_cungrq LAPACK_GLOBAL(cungrq,CUNGRQ) -#define LAPACK_zungrq LAPACK_GLOBAL(zungrq,ZUNGRQ) -#define LAPACK_sormrq LAPACK_GLOBAL(sormrq,SORMRQ) -#define LAPACK_dormrq LAPACK_GLOBAL(dormrq,DORMRQ) -#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq,CUNMRQ) -#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq,ZUNMRQ) -#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf,STZRZF) -#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf,DTZRZF) -#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf,CTZRZF) -#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf,ZTZRZF) -#define LAPACK_sormrz LAPACK_GLOBAL(sormrz,SORMRZ) -#define LAPACK_dormrz LAPACK_GLOBAL(dormrz,DORMRZ) -#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz,CUNMRZ) -#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz,ZUNMRZ) -#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf,SGGQRF) -#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf,DGGQRF) -#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf,CGGQRF) -#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf,ZGGQRF) -#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf,SGGRQF) -#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf,DGGRQF) -#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf,CGGRQF) -#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf,ZGGRQF) -#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd,SGEBRD) -#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd,DGEBRD) -#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd,CGEBRD) -#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd,ZGEBRD) -#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd,SGBBRD) -#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd,DGBBRD) -#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd,CGBBRD) -#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd,ZGBBRD) -#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr,SORGBR) -#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr,DORGBR) -#define LAPACK_sormbr LAPACK_GLOBAL(sormbr,SORMBR) -#define LAPACK_dormbr 
LAPACK_GLOBAL(dormbr,DORMBR) -#define LAPACK_cungbr LAPACK_GLOBAL(cungbr,CUNGBR) -#define LAPACK_zungbr LAPACK_GLOBAL(zungbr,ZUNGBR) -#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR) -#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr,ZUNMBR) -#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr,SBDSQR) -#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr,DBDSQR) -#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr,CBDSQR) -#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr,ZBDSQR) -#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc,SBDSDC) -#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc,DBDSDC) -#define LAPACK_sbdsvdx LAPACK_GLOBAL(sbdsvdx,SBDSVDX) -#define LAPACK_dbdsvdx LAPACK_GLOBAL(dbdsvdx,DBDSVDX) -#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd,SSYTRD) -#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd,DSYTRD) -#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr,SORGTR) -#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr,DORGTR) -#define LAPACK_sormtr LAPACK_GLOBAL(sormtr,SORMTR) -#define LAPACK_dormtr LAPACK_GLOBAL(dormtr,DORMTR) -#define LAPACK_chetrd LAPACK_GLOBAL(chetrd,CHETRD) -#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd,ZHETRD) -#define LAPACK_cungtr LAPACK_GLOBAL(cungtr,CUNGTR) -#define LAPACK_zungtr LAPACK_GLOBAL(zungtr,ZUNGTR) -#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr,CUNMTR) -#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr,ZUNMTR) -#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd,SSPTRD) -#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd,DSPTRD) -#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr,SOPGTR) -#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr,DOPGTR) -#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr,SOPMTR) -#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr,DOPMTR) -#define LAPACK_chptrd LAPACK_GLOBAL(chptrd,CHPTRD) -#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd,ZHPTRD) -#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr,CUPGTR) -#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr,ZUPGTR) -#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr,CUPMTR) -#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr,ZUPMTR) -#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd,SSBTRD) -#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd,DSBTRD) -#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd,CHBTRD) -#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd,ZHBTRD) -#define LAPACK_ssterf LAPACK_GLOBAL(ssterf,SSTERF) -#define LAPACK_dsterf LAPACK_GLOBAL(dsterf,DSTERF) -#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr,SSTEQR) -#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr,DSTEQR) -#define LAPACK_csteqr LAPACK_GLOBAL(csteqr,CSTEQR) -#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr,ZSTEQR) -#define LAPACK_sstemr LAPACK_GLOBAL(sstemr,SSTEMR) -#define LAPACK_dstemr LAPACK_GLOBAL(dstemr,DSTEMR) -#define LAPACK_cstemr LAPACK_GLOBAL(cstemr,CSTEMR) -#define LAPACK_zstemr LAPACK_GLOBAL(zstemr,ZSTEMR) -#define LAPACK_sstedc LAPACK_GLOBAL(sstedc,SSTEDC) -#define LAPACK_dstedc LAPACK_GLOBAL(dstedc,DSTEDC) -#define LAPACK_cstedc LAPACK_GLOBAL(cstedc,CSTEDC) -#define LAPACK_zstedc LAPACK_GLOBAL(zstedc,ZSTEDC) -#define LAPACK_sstegr LAPACK_GLOBAL(sstegr,SSTEGR) -#define LAPACK_dstegr LAPACK_GLOBAL(dstegr,DSTEGR) -#define LAPACK_cstegr LAPACK_GLOBAL(cstegr,CSTEGR) -#define LAPACK_zstegr LAPACK_GLOBAL(zstegr,ZSTEGR) -#define LAPACK_spteqr LAPACK_GLOBAL(spteqr,SPTEQR) -#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr,DPTEQR) -#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr,CPTEQR) -#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr,ZPTEQR) -#define LAPACK_sstebz LAPACK_GLOBAL(sstebz,SSTEBZ) -#define LAPACK_dstebz LAPACK_GLOBAL(dstebz,DSTEBZ) -#define LAPACK_sstein LAPACK_GLOBAL(sstein,SSTEIN) -#define LAPACK_dstein LAPACK_GLOBAL(dstein,DSTEIN) -#define LAPACK_cstein LAPACK_GLOBAL(cstein,CSTEIN) -#define LAPACK_zstein 
LAPACK_GLOBAL(zstein,ZSTEIN) -#define LAPACK_sdisna LAPACK_GLOBAL(sdisna,SDISNA) -#define LAPACK_ddisna LAPACK_GLOBAL(ddisna,DDISNA) -#define LAPACK_ssygst LAPACK_GLOBAL(ssygst,SSYGST) -#define LAPACK_dsygst LAPACK_GLOBAL(dsygst,DSYGST) -#define LAPACK_chegst LAPACK_GLOBAL(chegst,CHEGST) -#define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST) -#define LAPACK_sspgst LAPACK_GLOBAL(sspgst,SSPGST) -#define LAPACK_dspgst LAPACK_GLOBAL(dspgst,DSPGST) -#define LAPACK_chpgst LAPACK_GLOBAL(chpgst,CHPGST) -#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst,ZHPGST) -#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst,SSBGST) -#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst,DSBGST) -#define LAPACK_chbgst LAPACK_GLOBAL(chbgst,CHBGST) -#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst,ZHBGST) -#define LAPACK_spbstf LAPACK_GLOBAL(spbstf,SPBSTF) -#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf,DPBSTF) -#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf,CPBSTF) -#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf,ZPBSTF) -#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd,SGEHRD) -#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd,DGEHRD) -#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd,CGEHRD) -#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd,ZGEHRD) -#define LAPACK_sorghr LAPACK_GLOBAL(sorghr,SORGHR) -#define LAPACK_dorghr LAPACK_GLOBAL(dorghr,DORGHR) -#define LAPACK_sormhr LAPACK_GLOBAL(sormhr,SORMHR) -#define LAPACK_dormhr LAPACK_GLOBAL(dormhr,DORMHR) -#define LAPACK_cunghr LAPACK_GLOBAL(cunghr,CUNGHR) -#define LAPACK_zunghr LAPACK_GLOBAL(zunghr,ZUNGHR) -#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr,CUNMHR) -#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr,ZUNMHR) -#define LAPACK_sgebal LAPACK_GLOBAL(sgebal,SGEBAL) -#define LAPACK_dgebal LAPACK_GLOBAL(dgebal,DGEBAL) -#define LAPACK_cgebal LAPACK_GLOBAL(cgebal,CGEBAL) -#define LAPACK_zgebal LAPACK_GLOBAL(zgebal,ZGEBAL) -#define LAPACK_sgebak LAPACK_GLOBAL(sgebak,SGEBAK) -#define LAPACK_dgebak LAPACK_GLOBAL(dgebak,DGEBAK) -#define LAPACK_cgebak LAPACK_GLOBAL(cgebak,CGEBAK) -#define LAPACK_zgebak LAPACK_GLOBAL(zgebak,ZGEBAK) -#define LAPACK_shseqr LAPACK_GLOBAL(shseqr,SHSEQR) -#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr,DHSEQR) -#define LAPACK_chseqr LAPACK_GLOBAL(chseqr,CHSEQR) -#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr,ZHSEQR) -#define LAPACK_shsein LAPACK_GLOBAL(shsein,SHSEIN) -#define LAPACK_dhsein LAPACK_GLOBAL(dhsein,DHSEIN) -#define LAPACK_chsein LAPACK_GLOBAL(chsein,CHSEIN) -#define LAPACK_zhsein LAPACK_GLOBAL(zhsein,ZHSEIN) -#define LAPACK_strevc LAPACK_GLOBAL(strevc,STREVC) -#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc,DTREVC) -#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc,CTREVC) -#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc,ZTREVC) -#define LAPACK_strsna LAPACK_GLOBAL(strsna,STRSNA) -#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna,DTRSNA) -#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna,CTRSNA) -#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna,ZTRSNA) -#define LAPACK_strexc LAPACK_GLOBAL(strexc,STREXC) -#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc,DTREXC) -#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc,CTREXC) -#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc,ZTREXC) -#define LAPACK_strsen LAPACK_GLOBAL(strsen,STRSEN) -#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen,DTRSEN) -#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen,CTRSEN) -#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen,ZTRSEN) -#define LAPACK_strsyl LAPACK_GLOBAL(strsyl,STRSYL) -#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl,DTRSYL) -#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl,CTRSYL) -#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl,ZTRSYL) -#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd,SGGHRD) -#define LAPACK_dgghrd 
LAPACK_GLOBAL(dgghrd,DGGHRD) -#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd,CGGHRD) -#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd,ZGGHRD) -#define LAPACK_sgghd3 LAPACK_GLOBAL(sgghd3,SGGHD3) -#define LAPACK_dgghd3 LAPACK_GLOBAL(dgghd3,DGGHD3) -#define LAPACK_cgghd3 LAPACK_GLOBAL(cgghd3,CGGHD3) -#define LAPACK_zgghd3 LAPACK_GLOBAL(zgghd3,ZGGHD3) -#define LAPACK_sggbal LAPACK_GLOBAL(sggbal,SGGBAL) -#define LAPACK_dggbal LAPACK_GLOBAL(dggbal,DGGBAL) -#define LAPACK_cggbal LAPACK_GLOBAL(cggbal,CGGBAL) -#define LAPACK_zggbal LAPACK_GLOBAL(zggbal,ZGGBAL) -#define LAPACK_sggbak LAPACK_GLOBAL(sggbak,SGGBAK) -#define LAPACK_dggbak LAPACK_GLOBAL(dggbak,DGGBAK) -#define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK) -#define LAPACK_zggbak LAPACK_GLOBAL(zggbak,ZGGBAK) -#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz,SHGEQZ) -#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz,DHGEQZ) -#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz,CHGEQZ) -#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz,ZHGEQZ) -#define LAPACK_stgevc LAPACK_GLOBAL(stgevc,STGEVC) -#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc,DTGEVC) -#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc,CTGEVC) -#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc,ZTGEVC) -#define LAPACK_stgexc LAPACK_GLOBAL(stgexc,STGEXC) -#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc,DTGEXC) -#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc,CTGEXC) -#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc,ZTGEXC) -#define LAPACK_stgsen LAPACK_GLOBAL(stgsen,STGSEN) -#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen,DTGSEN) -#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen,CTGSEN) -#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen,ZTGSEN) -#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl,STGSYL) -#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl,DTGSYL) -#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl,CTGSYL) -#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl,ZTGSYL) -#define LAPACK_stgsna LAPACK_GLOBAL(stgsna,STGSNA) -#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna,DTGSNA) -#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna,CTGSNA) -#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna,ZTGSNA) -#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) -#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) -#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) -#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) -#define LAPACK_sggsvp3 LAPACK_GLOBAL(sggsvp3,SGGSVP3) -#define LAPACK_dggsvp3 LAPACK_GLOBAL(dggsvp3,DGGSVP3) -#define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3) -#define LAPACK_zggsvp3 LAPACK_GLOBAL(zggsvp3,ZGGSVP3) -#define LAPACK_stgsja LAPACK_GLOBAL(stgsja,STGSJA) -#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja,DTGSJA) -#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja,CTGSJA) -#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja,ZTGSJA) -#define LAPACK_sgels LAPACK_GLOBAL(sgels,SGELS) -#define LAPACK_dgels LAPACK_GLOBAL(dgels,DGELS) -#define LAPACK_cgels LAPACK_GLOBAL(cgels,CGELS) -#define LAPACK_zgels LAPACK_GLOBAL(zgels,ZGELS) -#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy,SGELSY) -#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy,DGELSY) -#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy,CGELSY) -#define LAPACK_zgelsy LAPACK_GLOBAL(zgelsy,ZGELSY) -#define LAPACK_sgelss LAPACK_GLOBAL(sgelss,SGELSS) -#define LAPACK_dgelss LAPACK_GLOBAL(dgelss,DGELSS) -#define LAPACK_cgelss LAPACK_GLOBAL(cgelss,CGELSS) -#define LAPACK_zgelss LAPACK_GLOBAL(zgelss,ZGELSS) -#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd,SGELSD) -#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd,DGELSD) -#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd,CGELSD) -#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd,ZGELSD) -#define LAPACK_sgglse LAPACK_GLOBAL(sgglse,SGGLSE) -#define LAPACK_dgglse 
LAPACK_GLOBAL(dgglse,DGGLSE) -#define LAPACK_cgglse LAPACK_GLOBAL(cgglse,CGGLSE) -#define LAPACK_zgglse LAPACK_GLOBAL(zgglse,ZGGLSE) -#define LAPACK_sggglm LAPACK_GLOBAL(sggglm,SGGGLM) -#define LAPACK_dggglm LAPACK_GLOBAL(dggglm,DGGGLM) -#define LAPACK_cggglm LAPACK_GLOBAL(cggglm,CGGGLM) -#define LAPACK_zggglm LAPACK_GLOBAL(zggglm,ZGGGLM) -#define LAPACK_ssyev LAPACK_GLOBAL(ssyev,SSYEV) -#define LAPACK_dsyev LAPACK_GLOBAL(dsyev,DSYEV) -#define LAPACK_cheev LAPACK_GLOBAL(cheev,CHEEV) -#define LAPACK_zheev LAPACK_GLOBAL(zheev,ZHEEV) -#define LAPACK_ssyev_2stage LAPACK_GLOBAL(ssyev_2stage,SSYEV_2STAGE) -#define LAPACK_dsyev_2stage LAPACK_GLOBAL(dsyev_2stage,DSYEV_2STAGE) -#define LAPACK_cheev_2stage LAPACK_GLOBAL(cheev_2stage,CHEEV_2STAGE) -#define LAPACK_zheev_2stage LAPACK_GLOBAL(zheev_2stage,ZHEEV_2STAGE) -#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd,SSYEVD) -#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd,DSYEVD) -#define LAPACK_cheevd LAPACK_GLOBAL(cheevd,CHEEVD) -#define LAPACK_zheevd LAPACK_GLOBAL(zheevd,ZHEEVD) -#define LAPACK_ssyevd_2stage LAPACK_GLOBAL(ssyevd_2stage,SSYEVD_2STAGE) -#define LAPACK_dsyevd_2stage LAPACK_GLOBAL(dsyevd_2stage,DSYEVD_2STAGE) -#define LAPACK_cheevd_2stage LAPACK_GLOBAL(cheevd_2stage,CHEEVD_2STAGE) -#define LAPACK_zheevd_2stage LAPACK_GLOBAL(zheevd_2stage,ZHEEVD_2STAGE) -#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx,SSYEVX) -#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx,DSYEVX) -#define LAPACK_cheevx LAPACK_GLOBAL(cheevx,CHEEVX) -#define LAPACK_zheevx LAPACK_GLOBAL(zheevx,ZHEEVX) -#define LAPACK_ssyevx_2stage LAPACK_GLOBAL(ssyevx_2stage,SSYEVX_2STAGE) -#define LAPACK_dsyevx_2stage LAPACK_GLOBAL(dsyevx_2stage,DSYEVX_2STAGE) -#define LAPACK_cheevx_2stage LAPACK_GLOBAL(cheevx_2stage,CHEEVX_2STAGE) -#define LAPACK_zheevx_2stage LAPACK_GLOBAL(zheevx_2stage,ZHEEVX_2STAGE) -#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr,SSYEVR) -#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr,DSYEVR) -#define LAPACK_cheevr LAPACK_GLOBAL(cheevr,CHEEVR) -#define LAPACK_zheevr LAPACK_GLOBAL(zheevr,ZHEEVR) -#define LAPACK_ssyevr_2stage LAPACK_GLOBAL(ssyevr_2stage,SSYEVR_2STAGE) -#define LAPACK_dsyevr_2stage LAPACK_GLOBAL(dsyevr_2stage,DSYEVR_2STAGE) -#define LAPACK_cheevr_2stage LAPACK_GLOBAL(cheevr_2stage,CHEEVR_2STAGE) -#define LAPACK_zheevr_2stage LAPACK_GLOBAL(zheevr_2stage,ZHEEVR_2STAGE) -#define LAPACK_sspev LAPACK_GLOBAL(sspev,SSPEV) -#define LAPACK_dspev LAPACK_GLOBAL(dspev,DSPEV) -#define LAPACK_chpev LAPACK_GLOBAL(chpev,CHPEV) -#define LAPACK_zhpev LAPACK_GLOBAL(zhpev,ZHPEV) -#define LAPACK_sspevd LAPACK_GLOBAL(sspevd,SSPEVD) -#define LAPACK_dspevd LAPACK_GLOBAL(dspevd,DSPEVD) -#define LAPACK_chpevd LAPACK_GLOBAL(chpevd,CHPEVD) -#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd,ZHPEVD) -#define LAPACK_sspevx LAPACK_GLOBAL(sspevx,SSPEVX) -#define LAPACK_dspevx LAPACK_GLOBAL(dspevx,DSPEVX) -#define LAPACK_chpevx LAPACK_GLOBAL(chpevx,CHPEVX) -#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx,ZHPEVX) -#define LAPACK_ssbev LAPACK_GLOBAL(ssbev,SSBEV) -#define LAPACK_dsbev LAPACK_GLOBAL(dsbev,DSBEV) -#define LAPACK_chbev LAPACK_GLOBAL(chbev,CHBEV) -#define LAPACK_zhbev LAPACK_GLOBAL(zhbev,ZHBEV) -#define LAPACK_ssbev_2stage LAPACK_GLOBAL(ssbev_2stage,SSBEV_2STAGE) -#define LAPACK_dsbev_2stage LAPACK_GLOBAL(dsbev_2stage,DSBEV_2STAGE) -#define LAPACK_chbev_2stage LAPACK_GLOBAL(chbev_2stage,CHBEV_2STAGE) -#define LAPACK_zhbev_2stage LAPACK_GLOBAL(zhbev_2stage,ZHBEV_2STAGE) -#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd,SSBEVD) -#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd,DSBEVD) -#define LAPACK_chbevd 
LAPACK_GLOBAL(chbevd,CHBEVD)
-#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd,ZHBEVD)
-#define LAPACK_ssbevd_2stage LAPACK_GLOBAL(ssbevd_2stage,SSBEVD_2STAGE)
-#define LAPACK_dsbevd_2stage LAPACK_GLOBAL(dsbevd_2stage,DSBEVD_2STAGE)
-#define LAPACK_chbevd_2stage LAPACK_GLOBAL(chbevd_2stage,CHBEVD_2STAGE)
-#define LAPACK_zhbevd_2stage LAPACK_GLOBAL(zhbevd_2stage,ZHBEVD_2STAGE)
-#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx,SSBEVX)
-#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx,DSBEVX)
-#define LAPACK_chbevx LAPACK_GLOBAL(chbevx,CHBEVX)
-#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx,ZHBEVX)
-#define LAPACK_ssbevx_2stage LAPACK_GLOBAL(ssbevx_2stage,SSBEVX_2STAGE)
-#define LAPACK_dsbevx_2stage LAPACK_GLOBAL(dsbevx_2stage,DSBEVX_2STAGE)
-#define LAPACK_chbevx_2stage LAPACK_GLOBAL(chbevx_2stage,CHBEVX_2STAGE)
-#define LAPACK_zhbevx_2stage LAPACK_GLOBAL(zhbevx_2stage,ZHBEVX_2STAGE)
-#define LAPACK_sstev LAPACK_GLOBAL(sstev,SSTEV)
-#define LAPACK_dstev LAPACK_GLOBAL(dstev,DSTEV)
-#define LAPACK_sstevd LAPACK_GLOBAL(sstevd,SSTEVD)
-#define LAPACK_dstevd LAPACK_GLOBAL(dstevd,DSTEVD)
-#define LAPACK_sstevx LAPACK_GLOBAL(sstevx,SSTEVX)
-#define LAPACK_dstevx LAPACK_GLOBAL(dstevx,DSTEVX)
-#define LAPACK_sstevr LAPACK_GLOBAL(sstevr,SSTEVR)
-#define LAPACK_dstevr LAPACK_GLOBAL(dstevr,DSTEVR)
-#define LAPACK_sgees LAPACK_GLOBAL(sgees,SGEES)
-#define LAPACK_dgees LAPACK_GLOBAL(dgees,DGEES)
-#define LAPACK_cgees LAPACK_GLOBAL(cgees,CGEES)
-#define LAPACK_zgees LAPACK_GLOBAL(zgees,ZGEES)
-#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx,SGEESX)
-#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx,DGEESX)
-#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx,CGEESX)
-#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx,ZGEESX)
-#define LAPACK_sgeev LAPACK_GLOBAL(sgeev,SGEEV)
-#define LAPACK_dgeev LAPACK_GLOBAL(dgeev,DGEEV)
-#define LAPACK_cgeev LAPACK_GLOBAL(cgeev,CGEEV)
-#define LAPACK_zgeev LAPACK_GLOBAL(zgeev,ZGEEV)
-#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx,SGEEVX)
-#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx,DGEEVX)
-#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx,CGEEVX)
-#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx,ZGEEVX)
-#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd,SGESVD)
-#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd,DGESVD)
-#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd,CGESVD)
-#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd,ZGESVD)
-#define LAPACK_sgesvdx LAPACK_GLOBAL(sgesvdx,SGESVDX)
-#define LAPACK_dgesvdx LAPACK_GLOBAL(dgesvdx,DGESVDX)
-#define LAPACK_cgesvdx LAPACK_GLOBAL(cgesvdx,CGESVDX)
-#define LAPACK_zgesvdx LAPACK_GLOBAL(zgesvdx,ZGESVDX)
-#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd,SGESDD)
-#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd,DGESDD)
-#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd,CGESDD)
-#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd,ZGESDD)
-#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv,SGEJSV)
-#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv,DGEJSV)
-#define LAPACK_cgejsv LAPACK_GLOBAL(cgejsv,CGEJSV)
-#define LAPACK_zgejsv LAPACK_GLOBAL(zgejsv,ZGEJSV)
-#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj,SGESVJ)
-#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj,DGESVJ)
-#define LAPACK_cgesvj LAPACK_GLOBAL(cgesvj,CGESVJ)
-#define LAPACK_zgesvj LAPACK_GLOBAL(zgesvj,ZGESVJ)
-#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
-#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
-#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
-#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
-#define LAPACK_ssygv LAPACK_GLOBAL(ssygv,SSYGV)
-#define LAPACK_dsygv LAPACK_GLOBAL(dsygv,DSYGV)
-#define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
-#define LAPACK_zhegv LAPACK_GLOBAL(zhegv,ZHEGV)
-#define LAPACK_ssygv_2stage LAPACK_GLOBAL(ssygv_2stage,SSYGV_2STAGE)
-#define LAPACK_dsygv_2stage LAPACK_GLOBAL(dsygv_2stage,DSYGV_2STAGE)
-#define LAPACK_chegv_2stage LAPACK_GLOBAL(chegv_2stage,CHEGV_2STAGE)
-#define LAPACK_zhegv_2stage LAPACK_GLOBAL(zhegv_2stage,ZHEGV_2STAGE)
-#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd,SSYGVD)
-#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd,DSYGVD)
-#define LAPACK_chegvd LAPACK_GLOBAL(chegvd,CHEGVD)
-#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd,ZHEGVD)
-#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx,SSYGVX)
-#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx,DSYGVX)
-#define LAPACK_chegvx LAPACK_GLOBAL(chegvx,CHEGVX)
-#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx,ZHEGVX)
-#define LAPACK_sspgv LAPACK_GLOBAL(sspgv,SSPGV)
-#define LAPACK_dspgv LAPACK_GLOBAL(dspgv,DSPGV)
-#define LAPACK_chpgv LAPACK_GLOBAL(chpgv,CHPGV)
-#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv,ZHPGV)
-#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd,SSPGVD)
-#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd,DSPGVD)
-#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd,CHPGVD)
-#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd,ZHPGVD)
-#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx,SSPGVX)
-#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx,DSPGVX)
-#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx,CHPGVX)
-#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx,ZHPGVX)
-#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv,SSBGV)
-#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv,DSBGV)
-#define LAPACK_chbgv LAPACK_GLOBAL(chbgv,CHBGV)
-#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv,ZHBGV)
-#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd,SSBGVD)
-#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd,DSBGVD)
-#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd,CHBGVD)
-#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd,ZHBGVD)
-#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx,SSBGVX)
-#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx,DSBGVX)
-#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx,CHBGVX)
-#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx,ZHBGVX)
-#define LAPACK_sgges LAPACK_GLOBAL(sgges,SGGES)
-#define LAPACK_dgges LAPACK_GLOBAL(dgges,DGGES)
-#define LAPACK_cgges LAPACK_GLOBAL(cgges,CGGES)
-#define LAPACK_zgges LAPACK_GLOBAL(zgges,ZGGES)
-#define LAPACK_sgges3 LAPACK_GLOBAL(sgges3,SGGES3)
-#define LAPACK_dgges3 LAPACK_GLOBAL(dgges3,DGGES3)
-#define LAPACK_cgges3 LAPACK_GLOBAL(cgges3,CGGES3)
-#define LAPACK_zgges3 LAPACK_GLOBAL(zgges3,ZGGES3)
-#define LAPACK_sggesx LAPACK_GLOBAL(sggesx,SGGESX)
-#define LAPACK_dggesx LAPACK_GLOBAL(dggesx,DGGESX)
-#define LAPACK_cggesx LAPACK_GLOBAL(cggesx,CGGESX)
-#define LAPACK_zggesx LAPACK_GLOBAL(zggesx,ZGGESX)
-#define LAPACK_sggev LAPACK_GLOBAL(sggev,SGGEV)
-#define LAPACK_dggev LAPACK_GLOBAL(dggev,DGGEV)
-#define LAPACK_cggev LAPACK_GLOBAL(cggev,CGGEV)
-#define LAPACK_zggev LAPACK_GLOBAL(zggev,ZGGEV)
-#define LAPACK_sggev3 LAPACK_GLOBAL(sggev3,SGGEV3)
-#define LAPACK_dggev3 LAPACK_GLOBAL(dggev3,DGGEV3)
-#define LAPACK_cggev3 LAPACK_GLOBAL(cggev3,CGGEV3)
-#define LAPACK_zggev3 LAPACK_GLOBAL(zggev3,ZGGEV3)
-#define LAPACK_sggevx LAPACK_GLOBAL(sggevx,SGGEVX)
-#define LAPACK_dggevx LAPACK_GLOBAL(dggevx,DGGEVX)
-#define LAPACK_cggevx LAPACK_GLOBAL(cggevx,CGGEVX)
-#define LAPACK_zggevx LAPACK_GLOBAL(zggevx,ZGGEVX)
-#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk,DSFRK)
-#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk,SSFRK)
-#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk,ZHFRK)
-#define LAPACK_chfrk LAPACK_GLOBAL(chfrk,CHFRK)
-#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm,DTFSM)
-#define LAPACK_stfsm LAPACK_GLOBAL(stfsm,STFSM)
-#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm,ZTFSM)
-#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm,CTFSM)
-#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp,DTFTTP)
-#define LAPACK_stfttp LAPACK_GLOBAL(stfttp,STFTTP)
-#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp,ZTFTTP)
-#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp,CTFTTP)
-#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr,DTFTTR)
-#define LAPACK_stfttr LAPACK_GLOBAL(stfttr,STFTTR)
-#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr,ZTFTTR)
-#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr,CTFTTR)
-#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf,DTPTTF)
-#define LAPACK_stpttf LAPACK_GLOBAL(stpttf,STPTTF)
-#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf,ZTPTTF)
-#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf,CTPTTF)
-#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr,DTPTTR)
-#define LAPACK_stpttr LAPACK_GLOBAL(stpttr,STPTTR)
-#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr,ZTPTTR)
-#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr,CTPTTR)
-#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf,DTRTTF)
-#define LAPACK_strttf LAPACK_GLOBAL(strttf,STRTTF)
-#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf,ZTRTTF)
-#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf,CTRTTF)
-#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp,DTRTTP)
-#define LAPACK_strttp LAPACK_GLOBAL(strttp,STRTTP)
-#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp,ZTRTTP)
-#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp,CTRTTP)
-#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp,SGEQRFP)
-#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp,DGEQRFP)
-#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp,CGEQRFP)
-#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp,ZGEQRFP)
-#define LAPACK_clacgv LAPACK_GLOBAL(clacgv,CLACGV)
-#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv,ZLACGV)
-#define LAPACK_slarnv LAPACK_GLOBAL(slarnv,SLARNV)
-#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv,DLARNV)
-#define LAPACK_clarnv LAPACK_GLOBAL(clarnv,CLARNV)
-#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv,ZLARNV)
-#define LAPACK_sgeqr2 LAPACK_GLOBAL(sgeqr2,SGEQR2)
-#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2,DGEQR2)
-#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2,CGEQR2)
-#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2,ZGEQR2)
-#define LAPACK_slacn2 LAPACK_GLOBAL(slacn2,SLACN2)
-#define LAPACK_dlacn2 LAPACK_GLOBAL(dlacn2,DLACN2)
-#define LAPACK_clacn2 LAPACK_GLOBAL(clacn2,CLACN2)
-#define LAPACK_zlacn2 LAPACK_GLOBAL(zlacn2,ZLACN2)
-#define LAPACK_slacpy LAPACK_GLOBAL(slacpy,SLACPY)
-#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy,DLACPY)
-#define LAPACK_clacpy LAPACK_GLOBAL(clacpy,CLACPY)
-#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy,ZLACPY)
-#define LAPACK_clacp2 LAPACK_GLOBAL(clacp2,CLACP2)
-#define LAPACK_zlacp2 LAPACK_GLOBAL(zlacp2,ZLACP2)
-#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2,SGETF2)
-#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2,DGETF2)
-#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2,CGETF2)
-#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2,ZGETF2)
-#define LAPACK_slaswp LAPACK_GLOBAL(slaswp,SLASWP)
-#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp,DLASWP)
-#define LAPACK_claswp LAPACK_GLOBAL(claswp,CLASWP)
-#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp,ZLASWP)
-#define LAPACK_slange LAPACK_GLOBAL(slange,SLANGE)
-#define LAPACK_dlange LAPACK_GLOBAL(dlange,DLANGE)
-#define LAPACK_clange LAPACK_GLOBAL(clange,CLANGE)
-#define LAPACK_zlange LAPACK_GLOBAL(zlange,ZLANGE)
-#define LAPACK_clanhe LAPACK_GLOBAL(clanhe,CLANHE)
-#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe,ZLANHE)
-#define LAPACK_clarcm LAPACK_GLOBAL(clarcm,CLARCM)
-#define LAPACK_zlarcm LAPACK_GLOBAL(zlarcm,ZLARCM)
-#define LAPACK_clacrm LAPACK_GLOBAL(clacrm,CLACRM)
-#define LAPACK_zlacrm LAPACK_GLOBAL(zlacrm,ZLACRM)
-#define LAPACK_slansy LAPACK_GLOBAL(slansy,SLANSY)
-#define LAPACK_dlansy LAPACK_GLOBAL(dlansy,DLANSY)
-#define LAPACK_clansy LAPACK_GLOBAL(clansy,CLANSY)
-#define LAPACK_zlansy LAPACK_GLOBAL(zlansy,ZLANSY)
-#define LAPACK_slantr LAPACK_GLOBAL(slantr,SLANTR)
-#define LAPACK_dlantr LAPACK_GLOBAL(dlantr,DLANTR)
-#define LAPACK_clantr LAPACK_GLOBAL(clantr,CLANTR)
-#define LAPACK_zlantr LAPACK_GLOBAL(zlantr,ZLANTR)
-#define LAPACK_slamch LAPACK_GLOBAL(slamch,SLAMCH)
-#define LAPACK_dlamch LAPACK_GLOBAL(dlamch,DLAMCH)
-#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2,SGELQ2)
-#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2,DGELQ2)
-#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2,CGELQ2)
-#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2,ZGELQ2)
-#define LAPACK_slarfb LAPACK_GLOBAL(slarfb,SLARFB)
-#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb,DLARFB)
-#define LAPACK_clarfb LAPACK_GLOBAL(clarfb,CLARFB)
-#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb,ZLARFB)
-#define LAPACK_slarfg LAPACK_GLOBAL(slarfg,SLARFG)
-#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg,DLARFG)
-#define LAPACK_clarfg LAPACK_GLOBAL(clarfg,CLARFG)
-#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg,ZLARFG)
-#define LAPACK_slassq LAPACK_GLOBAL(slassq,SLASSQ)
-#define LAPACK_dlassq LAPACK_GLOBAL(dlassq,DLASSQ)
-#define LAPACK_classq LAPACK_GLOBAL(classq,CLASSQ)
-#define LAPACK_zlassq LAPACK_GLOBAL(zlassq,ZLASSQ)
-#define LAPACK_slarft LAPACK_GLOBAL(slarft,SLARFT)
-#define LAPACK_dlarft LAPACK_GLOBAL(dlarft,DLARFT)
-#define LAPACK_clarft LAPACK_GLOBAL(clarft,CLARFT)
-#define LAPACK_zlarft LAPACK_GLOBAL(zlarft,ZLARFT)
-#define LAPACK_slarfx LAPACK_GLOBAL(slarfx,SLARFX)
-#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx,DLARFX)
-#define LAPACK_clarfx LAPACK_GLOBAL(clarfx,CLARFX)
-#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx,ZLARFX)
-#define LAPACK_slatms LAPACK_GLOBAL(slatms,SLATMS)
-#define LAPACK_dlatms LAPACK_GLOBAL(dlatms,DLATMS)
-#define LAPACK_clatms LAPACK_GLOBAL(clatms,CLATMS)
-#define LAPACK_zlatms LAPACK_GLOBAL(zlatms,ZLATMS)
-#define LAPACK_slag2d LAPACK_GLOBAL(slag2d,SLAG2D)
-#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s,DLAG2S)
-#define LAPACK_clag2z LAPACK_GLOBAL(clag2z,CLAG2Z)
-#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c,ZLAG2C)
-#define LAPACK_slauum LAPACK_GLOBAL(slauum,SLAUUM)
-#define LAPACK_dlauum LAPACK_GLOBAL(dlauum,DLAUUM)
-#define LAPACK_clauum LAPACK_GLOBAL(clauum,CLAUUM)
-#define LAPACK_zlauum LAPACK_GLOBAL(zlauum,ZLAUUM)
-#define LAPACK_slagge LAPACK_GLOBAL(slagge,SLAGGE)
-#define LAPACK_dlagge LAPACK_GLOBAL(dlagge,DLAGGE)
-#define LAPACK_clagge LAPACK_GLOBAL(clagge,CLAGGE)
-#define LAPACK_zlagge LAPACK_GLOBAL(zlagge,ZLAGGE)
-#define LAPACK_slascl LAPACK_GLOBAL(slascl,SLASCL)
-#define LAPACK_dlascl LAPACK_GLOBAL(dlascl,DLASCL)
-#define LAPACK_clascl LAPACK_GLOBAL(clascl,CLASCL)
-#define LAPACK_zlascl LAPACK_GLOBAL(zlascl,ZLASCL)
-#define LAPACK_slaset LAPACK_GLOBAL(slaset,SLASET)
-#define LAPACK_dlaset LAPACK_GLOBAL(dlaset,DLASET)
-#define LAPACK_claset LAPACK_GLOBAL(claset,CLASET)
-#define LAPACK_zlaset LAPACK_GLOBAL(zlaset,ZLASET)
-#define LAPACK_slasrt LAPACK_GLOBAL(slasrt,SLASRT)
-#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt,DLASRT)
-#define LAPACK_slagsy LAPACK_GLOBAL(slagsy,SLAGSY)
-#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy,DLAGSY)
-#define LAPACK_clagsy LAPACK_GLOBAL(clagsy,CLAGSY)
-#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy,ZLAGSY)
-#define LAPACK_claghe LAPACK_GLOBAL(claghe,CLAGHE)
-#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe,ZLAGHE)
-#define LAPACK_slapmr LAPACK_GLOBAL(slapmr,SLAPMR)
-#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr,DLAPMR)
-#define LAPACK_clapmr LAPACK_GLOBAL(clapmr,CLAPMR)
-#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr,ZLAPMR)
-#define LAPACK_slapmt LAPACK_GLOBAL(slapmt,SLAPMT)
-#define LAPACK_dlapmt LAPACK_GLOBAL(dlapmt,DLAPMT)
-#define LAPACK_clapmt LAPACK_GLOBAL(clapmt,CLAPMT)
-#define LAPACK_zlapmt LAPACK_GLOBAL(zlapmt,ZLAPMT)
-#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2,SLAPY2)
-#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2,DLAPY2)
-#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3,SLAPY3)
-#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3,DLAPY3)
-#define LAPACK_slartgp LAPACK_GLOBAL(slartgp,SLARTGP)
-#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp,DLARTGP)
-#define LAPACK_slartgs LAPACK_GLOBAL(slartgs,SLARTGS)
-#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs,DLARTGS)
-// LAPACK 3.3.0
-#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd,CBBCSD)
-#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr,CHESWAPR)
-#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2,CHETRI2)
-#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x,CHETRI2X)
-#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2,CHETRS2)
-#define LAPACK_csyconv LAPACK_GLOBAL(csyconv,CSYCONV)
-#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr,CSYSWAPR)
-#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2,CSYTRI2)
-#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x,CSYTRI2X)
-#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2,CSYTRS2)
-#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb,CUNBDB)
-#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd,CUNCSD)
-#define LAPACK_cuncsd2by1 LAPACK_GLOBAL(cuncsd2by1,CUNCSD2BY1)
-#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd,DBBCSD)
-#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb,DORBDB)
-#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd,DORCSD)
-#define LAPACK_dorcsd2by1 LAPACK_GLOBAL(dorcsd2by1,DORCSD2BY1)
-#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv,DSYCONV)
-#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr,DSYSWAPR)
-#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2,DSYTRI2)
-#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x,DSYTRI2X)
-#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2,DSYTRS2)
-#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd,SBBCSD)
-#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb,SORBDB)
-#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd,SORCSD)
-#define LAPACK_sorcsd2by1 LAPACK_GLOBAL(sorcsd2by1,SORCSD2BY1)
-#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv,SSYCONV)
-#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr,SSYSWAPR)
-#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2,SSYTRI2)
-#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x,SSYTRI2X)
-#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2,SSYTRS2)
-#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd,ZBBCSD)
-#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr,ZHESWAPR)
-#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2,ZHETRI2)
-#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x,ZHETRI2X)
-#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2,ZHETRS2)
-#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv,ZSYCONV)
-#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr,ZSYSWAPR)
-#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2,ZSYTRI2)
-#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x,ZSYTRI2X)
-#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2,ZSYTRS2)
-#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb,ZUNBDB)
-#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd,ZUNCSD)
-#define LAPACK_zuncsd2by1 LAPACK_GLOBAL(zuncsd2by1,ZUNCSD2BY1)
-// LAPACK 3.4.0
-#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt,SGEMQRT)
-#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt,DGEMQRT)
-#define LAPACK_cgemqrt LAPACK_GLOBAL(cgemqrt,CGEMQRT)
-#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt,ZGEMQRT)
-#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt,SGEQRT)
-#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt,DGEQRT)
-#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt,CGEQRT)
-#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt,ZGEQRT)
-#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2,SGEQRT2)
-#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2,DGEQRT2)
-#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2,CGEQRT2)
-#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2,ZGEQRT2)
-#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3,SGEQRT3)
-#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3,DGEQRT3)
-#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3,CGEQRT3)
-#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3,ZGEQRT3)
-#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt,STPMQRT)
-#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt,DTPMQRT)
-#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt,CTPMQRT)
-#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt,ZTPMQRT)
-#define LAPACK_stpqrt LAPACK_GLOBAL(stpqrt,STPQRT)
-#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt,DTPQRT)
-#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt,CTPQRT)
-#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt,ZTPQRT)
-#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2,STPQRT2)
-#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2,DTPQRT2)
-#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2,CTPQRT2)
-#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2,ZTPQRT2)
-#define LAPACK_stprfb LAPACK_GLOBAL(stprfb,STPRFB)
-#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb,DTPRFB)
-#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb,CTPRFB)
-#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb,ZTPRFB)
-// LAPACK 3.5.0
-#define LAPACK_ssysv_rook LAPACK_GLOBAL(ssysv_rook,SSYSV_ROOK)
-#define LAPACK_dsysv_rook LAPACK_GLOBAL(dsysv_rook,DSYSV_ROOK)
-#define LAPACK_csysv_rook LAPACK_GLOBAL(csysv_rook,CSYSV_ROOK)
-#define LAPACK_zsysv_rook LAPACK_GLOBAL(zsysv_rook,ZSYSV_ROOK)
-#define LAPACK_csyr LAPACK_GLOBAL(csyr,CSYR)
-#define LAPACK_zsyr LAPACK_GLOBAL(zsyr,ZSYR)
-#define LAPACK_ilaver LAPACK_GLOBAL(ilaver,ILAVER)
-// LAPACK 3.6.0
-#define LAPACK_sggsvd3 LAPACK_GLOBAL(sggsvd3,SGGSVD3)
-#define LAPACK_dggsvd3 LAPACK_GLOBAL(dggsvd3,DGGSVD3)
-#define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3)
-#define LAPACK_zggsvd3 LAPACK_GLOBAL(zggsvd3,ZGGSVD3)
-// LAPACK 3.7.0
-#define LAPACK_ssysv_aa LAPACK_GLOBAL(ssysv_aa,SSYSV_AA)
-#define LAPACK_dsysv_aa LAPACK_GLOBAL(dsysv_aa,DSYSV_AA)
-#define LAPACK_chesv_aa LAPACK_GLOBAL(chesv_aa,CHESV_AA)
-#define LAPACK_zsysv_aa LAPACK_GLOBAL(zsysv_aa,ZSYSV_AA)
-#define LAPACK_csysv_aa LAPACK_GLOBAL(csysv_aa,CSYSV_AA)
-#define LAPACK_zhesv_aa LAPACK_GLOBAL(zhesv_aa,ZHESV_AA)
-#define LAPACK_ssytrs_aa LAPACK_GLOBAL(ssytrs_aa,SSYTRS_AA)
-#define LAPACK_dsytrs_aa LAPACK_GLOBAL(dsytrs_aa,DSYTRS_AA)
-#define LAPACK_csytrs_aa LAPACK_GLOBAL(csytrs_aa,CSYTRS_AA)
-#define LAPACK_zsytrs_aa LAPACK_GLOBAL(zsytrs_aa,ZSYTRS_AA)
-#define LAPACK_chetrs_aa LAPACK_GLOBAL(chetrs_aa,CHETRS_AA)
-#define LAPACK_zhetrs_aa LAPACK_GLOBAL(zhetrs_aa,ZHETRS_AA)
-#define LAPACK_ssytrf_aa LAPACK_GLOBAL(ssytrf_aa,SSYTRF_AA)
-#define LAPACK_dsytrf_aa LAPACK_GLOBAL(dsytrf_aa,DSYTRF_AA)
-#define LAPACK_csytrf_aa LAPACK_GLOBAL(csytrf_aa,CSYTRF_AA)
-#define LAPACK_zsytrf_aa LAPACK_GLOBAL(zsytrf_aa,ZSYTRF_AA)
-#define LAPACK_chetrf_aa LAPACK_GLOBAL(chetrf_aa,CHETRF_AA)
-#define LAPACK_zhetrf_aa LAPACK_GLOBAL(zhetrf_aa,ZHETRF_AA)
-
-#define LAPACK_ssysv_rk LAPACK_GLOBAL(ssysv_rk,SSYSV_RK)
-#define LAPACK_dsysv_rk LAPACK_GLOBAL(dsysv_rk,DSYSV_RK)
-#define LAPACK_chesv_rk LAPACK_GLOBAL(chesv_rk,CHESV_RK)
-#define LAPACK_zsysv_rk LAPACK_GLOBAL(zsysv_rk,ZSYSV_RK)
-#define LAPACK_csysv_rk LAPACK_GLOBAL(csysv_rk,CSYSV_RK)
-#define LAPACK_zhesv_rk LAPACK_GLOBAL(zhesv_rk,ZHESV_RK)
-#define LAPACK_ssytrf_rk LAPACK_GLOBAL(ssytrf_rk,SSYTRF_RK)
-#define LAPACK_dsytrf_rk LAPACK_GLOBAL(dsytrf_rk,DSYTRF_RK)
-#define LAPACK_csytrf_rk LAPACK_GLOBAL(csytrf_rk,CSYTRF_RK)
-#define LAPACK_zsytrf_rk LAPACK_GLOBAL(zsytrf_rk,ZSYTRF_RK)
-#define LAPACK_chetrf_rk LAPACK_GLOBAL(chetrf_rk,CHETRF_RK)
-#define LAPACK_zhetrf_rk LAPACK_GLOBAL(zhetrf_rk,ZHETRF_RK)
-#define LAPACK_ssytrs_3 LAPACK_GLOBAL(ssytrs_3,SSYTRS_3)
-#define LAPACK_dsytrs_3 LAPACK_GLOBAL(dsytrs_3,DSYTRS_3)
-#define LAPACK_csytrs_3 LAPACK_GLOBAL(csytrs_3,CSYTRS_3)
-#define LAPACK_zsytrs_3 LAPACK_GLOBAL(zsytrs_3,ZSYTRS_3)
-#define LAPACK_chetrs_3 LAPACK_GLOBAL(chetrs_3,CHETRS_3)
-#define LAPACK_zhetrs_3 LAPACK_GLOBAL(zhetrs_3,ZHETRS_3)
-#define LAPACK_ssytri_3 LAPACK_GLOBAL(ssytri_3,SSYTRI_3)
-#define LAPACK_dsytri_3 LAPACK_GLOBAL(dsytri_3,DSYTRI_3)
-#define LAPACK_csytri_3 LAPACK_GLOBAL(csytri_3,CSYTRI_3)
-#define LAPACK_zsytri_3 LAPACK_GLOBAL(zsytri_3,ZSYTRI_3)
-#define LAPACK_chetri_3 LAPACK_GLOBAL(chetri_3,CHETRI_3)
-#define LAPACK_zhetri_3 LAPACK_GLOBAL(zhetri_3,ZHETRI_3)
-#define LAPACK_ssycon_3 LAPACK_GLOBAL(ssycon_3,SSYCON_3)
-#define LAPACK_dsycon_3 LAPACK_GLOBAL(dsycon_3,DSYCON_3)
-#define LAPACK_csycon_3 LAPACK_GLOBAL(csycon_3,CSYCON_3)
-#define LAPACK_zsycon_3 LAPACK_GLOBAL(zsycon_3,ZSYCON_3)
-#define LAPACK_checon_3 LAPACK_GLOBAL(checon_3,CHECON_3)
-#define LAPACK_zhecon_3 LAPACK_GLOBAL(zhecon_3,ZHECON_3)
-#define LAPACK_sgelq LAPACK_GLOBAL(sgelq,SGELQ)
-#define LAPACK_dgelq LAPACK_GLOBAL(dgelq,DGELQ)
-#define LAPACK_cgelq LAPACK_GLOBAL(cgelq,CGELQ)
-#define LAPACK_zgelq LAPACK_GLOBAL(zgelq,ZGELQ)
-#define LAPACK_sgemlq LAPACK_GLOBAL(sgemlq,SGEMLQ)
-#define LAPACK_dgemlq LAPACK_GLOBAL(dgemlq,DGEMLQ)
-#define LAPACK_cgemlq LAPACK_GLOBAL(cgemlq,CGEMLQ)
-#define LAPACK_zgemlq LAPACK_GLOBAL(zgemlq,ZGEMLQ)
-#define LAPACK_sgeqr LAPACK_GLOBAL(sgeqr,SGEQR)
-#define LAPACK_dgeqr LAPACK_GLOBAL(dgeqr,DGEQR)
-#define LAPACK_cgeqr LAPACK_GLOBAL(cgeqr,CGEQR)
-#define LAPACK_zgeqr LAPACK_GLOBAL(zgeqr,ZGEQR)
-#define LAPACK_sgemqr LAPACK_GLOBAL(sgemqr,SGEMQR)
-#define LAPACK_dgemqr LAPACK_GLOBAL(dgemqr,DGEMQR)
-#define LAPACK_cgemqr LAPACK_GLOBAL(cgemqr,CGEMQR)
-#define LAPACK_zgemqr LAPACK_GLOBAL(zgemqr,ZGEMQR)
-#define LAPACK_sgetsls LAPACK_GLOBAL(sgetsls,SGETSLS)
-#define LAPACK_dgetsls LAPACK_GLOBAL(dgetsls,DGETSLS)
-#define LAPACK_cgetsls LAPACK_GLOBAL(cgetsls,CGETSLS)
-#define LAPACK_zgetsls LAPACK_GLOBAL(zgetsls,ZGETSLS)
-
-// LAPACK 3.8.0
-#define LAPACK_ssysv_aa_2stage LAPACK_GLOBAL(ssysv_aa_2stage,SSYSV_AA_2STAGE)
-#define LAPACK_dsysv_aa_2stage LAPACK_GLOBAL(dsysv_aa_2stage,DSYSV_AA_2STAGE)
-#define LAPACK_chesv_aa_2stage LAPACK_GLOBAL(chesv_aa_2stage,CHESV_AA_2STAGE)
-#define LAPACK_zsysv_aa_2stage LAPACK_GLOBAL(zsysv_aa_2stage,ZSYSV_AA_2STAGE)
-#define LAPACK_csysv_aa_2stage LAPACK_GLOBAL(csysv_aa_2stage,CSYSV_AA_2STAGE)
-#define LAPACK_zhesv_aa_2stage LAPACK_GLOBAL(zhesv_aa_2stage,ZHESV_AA_2STAGE)
-#define LAPACK_ssytrs_aa_2stage LAPACK_GLOBAL(ssytrs_aa_2stage,SSYTRS_AA_2STAGE)
-#define LAPACK_dsytrs_aa_2stage LAPACK_GLOBAL(dsytrs_aa_2stage,DSYTRS_AA_2STAGE)
-#define LAPACK_csytrs_aa_2stage LAPACK_GLOBAL(csytrs_aa_2stage,CSYTRS_AA_2STAGE)
-#define LAPACK_zsytrs_aa_2stage LAPACK_GLOBAL(zsytrs_aa_2stage,ZSYTRS_AA_2STAGE)
-#define LAPACK_chetrs_aa_2stage LAPACK_GLOBAL(chetrs_aa_2stage,CHETRS_AA_2STAGE)
-#define LAPACK_zhetrs_aa_2stage LAPACK_GLOBAL(zhetrs_aa_2stage,ZHETRS_AA_2STAGE)
-#define LAPACK_ssytrf_aa_2stage LAPACK_GLOBAL(ssytrf_aa_2stage,SSYTRF_AA_2STAGE)
-#define LAPACK_dsytrf_aa_2stage LAPACK_GLOBAL(dsytrf_aa_2stage,DSYTRF_AA_2STAGE)
-#define LAPACK_csytrf_aa_2stage LAPACK_GLOBAL(csytrf_aa_2stage,CSYTRF_AA_2STAGE)
-#define LAPACK_zsytrf_aa_2stage LAPACK_GLOBAL(zsytrf_aa_2stage,ZSYTRF_AA_2STAGE)
-#define LAPACK_chetrf_aa_2stage LAPACK_GLOBAL(chetrf_aa_2stage,CHETRF_AA_2STAGE)
-#define LAPACK_zhetrf_aa_2stage LAPACK_GLOBAL(zhetrf_aa_2stage,ZHETRF_AA_2STAGE)
-
-
-void LAPACK_sgetrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgetrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgetrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_zgetrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_sgetrf2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgetrf2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgetrf2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_zgetrf2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_sgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, float* ab, lapack_int* ldab,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, double* ab, lapack_int* ldab,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_zgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_sgttrf( lapack_int* n, float* dl, float* d, float* du, float* du2,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgttrf( lapack_int* n, double* dl, double* d, double* du,
-                    double* du2, lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgttrf( lapack_int* n, lapack_complex_float* dl,
-                    lapack_complex_float* d, lapack_complex_float* du,
-                    lapack_complex_float* du2, lapack_int* ipiv,
-                    lapack_int *info );
-void LAPACK_zgttrf( lapack_int* n, lapack_complex_double* dl,
-                    lapack_complex_double* d, lapack_complex_double* du,
-                    lapack_complex_double* du2, lapack_int* ipiv,
-                    lapack_int *info );
-void LAPACK_spotrf2( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dpotrf2( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_cpotrf2( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_zpotrf2( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_spotrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dpotrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_cpotrf( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_zpotrf( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_dpstrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* piv, lapack_int* rank, double* tol,
-                    double* work, lapack_int *info );
-void LAPACK_spstrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* piv, lapack_int* rank, float* tol, float* work,
-                    lapack_int *info );
-void LAPACK_zpstrf( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
-                    double* tol, double* work, lapack_int *info );
-void LAPACK_cpstrf( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
-                    float* tol, float* work, lapack_int *info );
-void LAPACK_dpftrf( char* transr, char* uplo, lapack_int* n, double* a,
-                    lapack_int *info );
-void LAPACK_spftrf( char* transr, char* uplo, lapack_int* n, float* a,
-                    lapack_int *info );
-void LAPACK_zpftrf( char* transr, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int *info );
-void LAPACK_cpftrf( char* transr, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int *info );
-void LAPACK_spptrf( char* uplo, lapack_int* n, float* ap, lapack_int *info );
-void LAPACK_dpptrf( char* uplo, lapack_int* n, double* ap, lapack_int *info );
-void LAPACK_cpptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    lapack_int *info );
-void LAPACK_zpptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    lapack_int *info );
-void LAPACK_spbtrf( char* uplo, lapack_int* n, lapack_int* kd, float* ab,
-                    lapack_int* ldab, lapack_int *info );
-void LAPACK_dpbtrf( char* uplo, lapack_int* n, lapack_int* kd, double* ab,
-                    lapack_int* ldab, lapack_int *info );
-void LAPACK_cpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_int *info );
-void LAPACK_zpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_int *info );
-void LAPACK_spttrf( lapack_int* n, float* d, float* e, lapack_int *info );
-void LAPACK_dpttrf( lapack_int* n, double* d, double* e, lapack_int *info );
-void LAPACK_cpttrf( lapack_int* n, float* d, lapack_complex_float* e,
-                    lapack_int *info );
-void LAPACK_zpttrf( lapack_int* n, double* d, lapack_complex_double* e,
-                    lapack_int *info );
-void LAPACK_ssytrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dsytrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_csytrf( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zsytrf( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_chetrf( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zhetrf( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_ssptrf( char* uplo, lapack_int* n, float* ap, lapack_int* ipiv,
-                    lapack_int *info );
-void LAPACK_dsptrf( char* uplo, lapack_int* n, double* ap, lapack_int* ipiv,
-                    lapack_int *info );
-void LAPACK_csptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_zsptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_chptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_zhptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_sgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* a, lapack_int* lda, const lapack_int* ipiv,
-                    float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const lapack_int* ipiv,
-                    double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_cgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_sgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
-                    const lapack_int* ipiv, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
-                    const lapack_int* ipiv, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_cgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const lapack_complex_float* ab,
-                    lapack_int* ldab, const lapack_int* ipiv,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const lapack_complex_double* ab,
-                    lapack_int* ldab, const lapack_int* ipiv,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_sgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* dl, const float* d, const float* du,
-                    const float* du2, const lapack_int* ipiv, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* dl, const double* d, const double* du,
-                    const double* du2, const lapack_int* ipiv, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_cgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* dl,
-                    const lapack_complex_float* d,
-                    const lapack_complex_float* du,
-                    const lapack_complex_float* du2, const lapack_int* ipiv,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* dl,
-                    const lapack_complex_double* d,
-                    const lapack_complex_double* du,
-                    const lapack_complex_double* du2, const lapack_int* ipiv,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_spotrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_cpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_spftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* a, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_cpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_spptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_cpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_spbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const float* ab, lapack_int* ldab, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const double* ab, lapack_int* ldab, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_cpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_spttrs( lapack_int* n, lapack_int* nrhs, const float* d,
-                    const float* e, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dpttrs( lapack_int* n, lapack_int* nrhs, const double* d,
-                    const double* e, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_cpttrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
-                    const lapack_complex_float* e, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zpttrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* d, const lapack_complex_double* e,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ssytrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, const lapack_int* ipiv, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const lapack_int* ipiv,
-                    double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_csytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_chetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zhetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_ssptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, const lapack_int* ipiv, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, const lapack_int* ipiv, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_csptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap, const lapack_int* ipiv,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap, const lapack_int* ipiv,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_chptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap, const lapack_int* ipiv,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zhptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap, const lapack_int* ipiv,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_strtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const float* a, lapack_int* lda, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dtrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const double* a, lapack_int* lda,
-                    double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_ctrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ztrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_stptrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const float* ap, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dtptrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const double* ap, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_ctptrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_float* ap,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ztptrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_double* ap,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_stbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs, const float* ab,
-                    lapack_int* ldab, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dtbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs, const double* ab,
-                    lapack_int* ldab, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ctbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ztbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_sgecon( char* norm, lapack_int* n, const float* a, lapack_int* lda,
-                    float* anorm, float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgecon( char* norm, lapack_int* n, const double* a, lapack_int* lda,
-                    double* anorm, double* rcond, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cgecon( char* norm, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, float* anorm, float* rcond,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgecon( char* norm, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, double* anorm, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    const float* ab, lapack_int* ldab, const lapack_int* ipiv,
-                    float* anorm, float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    const double* ab, lapack_int* ldab, const lapack_int* ipiv,
-                    double* anorm, double* rcond, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sgtcon( char* norm, lapack_int* n, const float* dl, const float* d,
-                    const float* du, const float* du2, const lapack_int* ipiv,
-                    float* anorm, float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgtcon( char* norm, lapack_int* n, const double* dl,
-                    const double* d, const double* du, const double* du2,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cgtcon( char* norm, lapack_int* n, const lapack_complex_float* dl,
-                    const lapack_complex_float* d,
-                    const lapack_complex_float* du,
-                    const lapack_complex_float* du2, const lapack_int* ipiv,
-                    float* anorm, float* rcond, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zgtcon( char* norm, lapack_int* n, const lapack_complex_double* dl,
-                    const lapack_complex_double* d,
-                    const lapack_complex_double* du,
-                    const lapack_complex_double* du2, const lapack_int* ipiv,
-                    double* anorm, double* rcond, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_spocon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
-                    float* anorm, float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dpocon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
-                    double* anorm, double* rcond, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cpocon( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, float* anorm, float* rcond,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zpocon( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, double* anorm, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sppcon( char* uplo, lapack_int* n, const float* ap, float* anorm,
-                    float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dppcon( char* uplo, lapack_int* n, const double* ap, double* anorm,
-                    double* rcond, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cppcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    float* anorm, float* rcond, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zppcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    double* anorm, double* rcond, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_spbcon( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
-                    lapack_int* ldab, float* anorm, float* rcond, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dpbcon( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
-                    lapack_int* ldab, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cpbcon( char* uplo, lapack_int* n, lapack_int* kd,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    float* anorm, float* rcond, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zpbcon( char* uplo, lapack_int* n, lapack_int* kd,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    double* anorm, double* rcond, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_sptcon( lapack_int* n, const float* d, const float* e, float* anorm,
-                    float* rcond, float* work, lapack_int *info );
-void LAPACK_dptcon( lapack_int* n, const double* d, const double* e,
-                    double* anorm, double* rcond, double* work,
-                    lapack_int *info );
-void LAPACK_cptcon( lapack_int* n, const float* d,
-                    const lapack_complex_float* e, float* anorm, float* rcond,
-                    float* work, lapack_int *info );
-void LAPACK_zptcon( lapack_int* n, const double* d,
-                    const lapack_complex_double* e, double* anorm,
-                    double* rcond, double* work, lapack_int *info );
-void LAPACK_ssycon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dsycon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_csycon( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
-                    float* rcond, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zsycon( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
-                    double* rcond, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_checon( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
-                    float* rcond, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zhecon( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
-                    double* rcond, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_sspcon( char* uplo, lapack_int* n, const float* ap,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dspcon( char* uplo, lapack_int* n, const double* ap,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cspcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zspcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_chpcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zhpcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_strcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const float* a, lapack_int* lda, float* rcond, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dtrcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const double* a, lapack_int* lda, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_ctrcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    float* rcond, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztrcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    double* rcond, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_stpcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const float* ap, float* rcond, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dtpcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const double* ap, double* rcond, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_ctpcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const lapack_complex_float* ap, float* rcond,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztpcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const lapack_complex_double* ap, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_stbcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    lapack_int* kd, const float* ab, lapack_int* ldab,
-                    float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dtbcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    lapack_int* kd, const double* ab, lapack_int* ldab,
-                    double* rcond, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_ctbcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    lapack_int* kd, const lapack_complex_float* ab,
-                    lapack_int* ldab, float* rcond, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_ztbcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    lapack_int* kd, const lapack_complex_double* ab,
-                    lapack_int* ldab, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* a, lapack_int* lda, const float* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
-                    float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_dgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const double* r,
-                    const double* c, const double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* rcond, double* berr,
-                    lapack_int* n_err_bnds, double* err_bnds_norm,
-                    double* err_bnds_comp, lapack_int* nparams, double* params,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const float* a, lapack_int* lda, const float* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const float* r,
-                    const float* c, const float* b, lapack_int* ldb, float* x,
-                    lapack_int* ldx, float* rcond, float* berr,
-                    lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const double* r, const double* c,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_cgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const float* r, const float* c,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_sgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
-                    const float* afb, lapack_int* ldafb, const lapack_int* ipiv,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
-                    const double* afb, lapack_int* ldafb,
-                    const lapack_int* ipiv, const double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* ferr, double* berr,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const lapack_complex_float* ab,
-                    lapack_int* ldab, const lapack_complex_float* afb,
-                    lapack_int* ldafb, const lapack_int* ipiv,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const lapack_complex_double* ab,
-                    lapack_int* ldab, const lapack_complex_double* afb,
-                    lapack_int* ldafb, const lapack_int* ipiv,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_dgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_int* nrhs, const double* ab,
-                    lapack_int* ldab, const double* afb, lapack_int* ldafb,
-                    const lapack_int* ipiv, const double* r, const double* c,
-                    const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* rcond, double* berr,
-                    lapack_int* n_err_bnds, double* err_bnds_norm,
-                    double* err_bnds_comp, lapack_int* nparams, double* params,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_int* nrhs, const float* ab,
-                    lapack_int* ldab, const float* afb, lapack_int* ldafb,
-                    const lapack_int* ipiv, const float* r, const float* c,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* rcond, float* berr, lapack_int* n_err_bnds,
-                    float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_zgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    const lapack_complex_double* afb, lapack_int* ldafb,
-                    const lapack_int* ipiv, const double* r, const double* c,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_cgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    const lapack_complex_float* afb, lapack_int* ldafb,
-                    const lapack_int* ipiv, const float* r, const float* c,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_sgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* dl, const float* d, const float* du,
-                    const float* dlf, const float* df, const float* duf,
-                    const float* du2, const lapack_int* ipiv, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
-                    float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* dl, const double* d, const double* du,
-                    const double* dlf, const double* df, const double* duf,
-                    const double* du2, const lapack_int* ipiv, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* dl,
-                    const lapack_complex_float* d,
-                    const lapack_complex_float* du,
-                    const lapack_complex_float* dlf,
-                    const lapack_complex_float* df,
-                    const lapack_complex_float* duf,
-                    const lapack_complex_float* du2, const lapack_int* ipiv,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* dl,
-                    const lapack_complex_double* d,
-                    const lapack_complex_double* du,
-                    const lapack_complex_double* dlf,
-                    const lapack_complex_double* df,
-                    const lapack_complex_double* duf,
-                    const lapack_complex_double* du2, const lapack_int* ipiv,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sporfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, const float* af, lapack_int* ldaf,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* ferr, double* berr,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_dporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const double* s, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                    double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_sporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const float* a, lapack_int* lda, const float* af,
-                    lapack_int* ldaf, const float* s, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                    float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const double* s, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_cporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const float* s, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* berr, lapack_int* n_err_bnds,
-                    float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_spprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, const float* afp, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
-                    float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, const double* afp, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap,
-                    const lapack_complex_float* afp,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap,
-                    const lapack_complex_double* afp,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_spbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const float* ab, lapack_int* ldab, const float* afb,
-                    lapack_int* ldafb, const float* b, lapack_int* ldb,
-                    float* x, lapack_int* ldx, float* ferr, float* berr,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const double* ab, lapack_int* ldab, const double* afb,
-                    lapack_int* ldafb, const double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* ferr, double* berr,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    const lapack_complex_float* afb, lapack_int* ldafb,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    const lapack_complex_double* afb, lapack_int* ldafb,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sptrfs( lapack_int* n, lapack_int* nrhs, const float* d,
-                    const float* e, const float* df, const float* ef,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int *info );
-void LAPACK_dptrfs( lapack_int* n, lapack_int* nrhs, const double* d,
-                    const double* e, const double* df, const double* ef,
-                    const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* ferr, double* berr, double* work,
-                    lapack_int *info );
-void LAPACK_cptrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
-                    const lapack_complex_float* e, const float* df,
-                    const lapack_complex_float* ef,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zptrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* d, const lapack_complex_double* e,
-                    const double* df, const lapack_complex_double* ef,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_ssyrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, const float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const float* b, lapack_int* ldb,
-                    float* x, lapack_int* ldx, float* ferr, float* berr,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_csyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_dsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const double* s,
-                    const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* rcond, double* berr,
-                    lapack_int* n_err_bnds, double* err_bnds_norm,
-                    double* err_bnds_comp, lapack_int* nparams, double* params,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_ssyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const float* a, lapack_int* lda, const float* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const float* s,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* rcond, float* berr, lapack_int* n_err_bnds,
-                    float* err_bnds_norm, float* err_bnds_comp,
-                    lapack_int* nparams, float* params, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_zsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const double* s,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_csyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const float* s,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_cherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_zherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const double* s,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* berr, lapack_int* n_err_bnds,
-                    double* err_bnds_norm, double* err_bnds_comp,
-                    lapack_int* nparams, double* params,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_cherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const float* s,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                    float* err_bnds_comp, lapack_int* nparams, float* params,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ssprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, const float* afp, const lapack_int* ipiv,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, const double* afp, const lapack_int* ipiv,
-                    const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* ferr, double* berr, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_csprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap,
-                    const lapack_complex_float* afp, const lapack_int* ipiv,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float*
rwork, - lapack_int *info ); -void LAPACK_zsprfs( char* uplo, lapack_int* n, lapack_int* nrhs, - const lapack_complex_double* ap, - const lapack_complex_double* afp, const lapack_int* ipiv, - const lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* x, lapack_int* ldx, double* ferr, - double* berr, lapack_complex_double* work, double* rwork, - lapack_int *info ); -void LAPACK_chprfs( char* uplo, lapack_int* n, lapack_int* nrhs, - const lapack_complex_float* ap, - const lapack_complex_float* afp, const lapack_int* ipiv, - const lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* x, lapack_int* ldx, float* ferr, - float* berr, lapack_complex_float* work, float* rwork, - lapack_int *info ); -void LAPACK_zhprfs( char* uplo, lapack_int* n, lapack_int* nrhs, - const lapack_complex_double* ap, - const lapack_complex_double* afp, const lapack_int* ipiv, - const lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* x, lapack_int* ldx, double* ferr, - double* berr, lapack_complex_double* work, double* rwork, - lapack_int *info ); -void LAPACK_strrfs( char* uplo, char* trans, char* diag, lapack_int* n, - lapack_int* nrhs, const float* a, lapack_int* lda, - const float* b, lapack_int* ldb, const float* x, - lapack_int* ldx, float* ferr, float* berr, float* work, - lapack_int* iwork, lapack_int *info ); -void LAPACK_dtrrfs( char* uplo, char* trans, char* diag, lapack_int* n, - lapack_int* nrhs, const double* a, lapack_int* lda, - const double* b, lapack_int* ldb, const double* x, - lapack_int* ldx, double* ferr, double* berr, double* work, - lapack_int* iwork, lapack_int *info ); -void LAPACK_ctrrfs( char* uplo, char* trans, char* diag, lapack_int* n, - lapack_int* nrhs, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* b, - lapack_int* ldb, const lapack_complex_float* x, - lapack_int* ldx, float* ferr, float* berr, - lapack_complex_float* work, float* rwork, - lapack_int *info ); -void LAPACK_ztrrfs( char* uplo, char* trans, char* diag, lapack_int* n, - lapack_int* nrhs, const lapack_complex_double* a, - lapack_int* lda, const lapack_complex_double* b, - lapack_int* ldb, const lapack_complex_double* x, - lapack_int* ldx, double* ferr, double* berr, - lapack_complex_double* work, double* rwork, - lapack_int *info ); -void LAPACK_stprfs( char* uplo, char* trans, char* diag, lapack_int* n, - lapack_int* nrhs, const float* ap, const float* b, - lapack_int* ldb, const float* x, lapack_int* ldx, - float* ferr, float* berr, float* work, lapack_int* iwork, - lapack_int *info ); -void LAPACK_dtprfs( char* uplo, char* trans, char* diag, lapack_int* n, - lapack_int* nrhs, const double* ap, const double* b, - lapack_int* ldb, const double* x, lapack_int* ldx, - double* ferr, double* berr, double* work, lapack_int* iwork, - lapack_int *info ); -void LAPACK_ctprfs( char* uplo, char* trans, char* diag, lapack_int* n, - lapack_int* nrhs, const lapack_complex_float* ap, - const lapack_complex_float* b, lapack_int* ldb, - const lapack_complex_float* x, lapack_int* ldx, float* ferr, - float* berr, lapack_complex_float* work, float* rwork, - lapack_int *info ); -void LAPACK_ztprfs( char* uplo, char* trans, char* diag, lapack_int* n, - lapack_int* nrhs, const lapack_complex_double* ap, - const lapack_complex_double* b, lapack_int* ldb, - const lapack_complex_double* x, lapack_int* ldx, - double* ferr, double* berr, lapack_complex_double* work, - double* rwork, lapack_int *info ); -void LAPACK_stbrfs( char* uplo, char* trans, char* diag, lapack_int* n, 
- lapack_int* kd, lapack_int* nrhs, const float* ab,
- lapack_int* ldab, const float* b, lapack_int* ldb,
- const float* x, lapack_int* ldx, float* ferr, float* berr,
- float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dtbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
- lapack_int* kd, lapack_int* nrhs, const double* ab,
- lapack_int* ldab, const double* b, lapack_int* ldb,
- const double* x, lapack_int* ldx, double* ferr,
- double* berr, double* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_ctbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
- lapack_int* kd, lapack_int* nrhs,
- const lapack_complex_float* ab, lapack_int* ldab,
- const lapack_complex_float* b, lapack_int* ldb,
- const lapack_complex_float* x, lapack_int* ldx, float* ferr,
- float* berr, lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_ztbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
- lapack_int* kd, lapack_int* nrhs,
- const lapack_complex_double* ab, lapack_int* ldab,
- const lapack_complex_double* b, lapack_int* ldb,
- const lapack_complex_double* x, lapack_int* ldx,
- double* ferr, double* berr, lapack_complex_double* work,
- double* rwork, lapack_int *info );
-void LAPACK_sgetri( lapack_int* n, float* a, lapack_int* lda,
- const lapack_int* ipiv, float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_dgetri( lapack_int* n, double* a, lapack_int* lda,
- const lapack_int* ipiv, double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_cgetri( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
- const lapack_int* ipiv, lapack_complex_float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_zgetri( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
- const lapack_int* ipiv, lapack_complex_double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_spotri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
- lapack_int *info );
-void LAPACK_dpotri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
- lapack_int *info );
-void LAPACK_cpotri( char* uplo, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, lapack_int *info );
-void LAPACK_zpotri( char* uplo, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, lapack_int *info );
-void LAPACK_dpftri( char* transr, char* uplo, lapack_int* n, double* a,
- lapack_int *info );
-void LAPACK_spftri( char* transr, char* uplo, lapack_int* n, float* a,
- lapack_int *info );
-void LAPACK_zpftri( char* transr, char* uplo, lapack_int* n,
- lapack_complex_double* a, lapack_int *info );
-void LAPACK_cpftri( char* transr, char* uplo, lapack_int* n,
- lapack_complex_float* a, lapack_int *info );
-void LAPACK_spptri( char* uplo, lapack_int* n, float* ap, lapack_int *info );
-void LAPACK_dpptri( char* uplo, lapack_int* n, double* ap, lapack_int *info );
-void LAPACK_cpptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
- lapack_int *info );
-void LAPACK_zpptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
- lapack_int *info );
-void LAPACK_ssytri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
- const lapack_int* ipiv, float* work, lapack_int *info );
-void LAPACK_dsytri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
- const lapack_int* ipiv, double* work, lapack_int *info );
-void LAPACK_csytri( char* uplo, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, const lapack_int* ipiv,
- lapack_complex_float* work, lapack_int *info );
-void LAPACK_zsytri( char* uplo, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, const lapack_int* ipiv,
- lapack_complex_double* work, lapack_int *info );
-void LAPACK_chetri( char* uplo, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, const lapack_int* ipiv,
- lapack_complex_float* work, lapack_int *info );
-void LAPACK_zhetri( char* uplo, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, const lapack_int* ipiv,
- lapack_complex_double* work, lapack_int *info );
-void LAPACK_ssptri( char* uplo, lapack_int* n, float* ap,
- const lapack_int* ipiv, float* work, lapack_int *info );
-void LAPACK_dsptri( char* uplo, lapack_int* n, double* ap,
- const lapack_int* ipiv, double* work, lapack_int *info );
-void LAPACK_csptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
- const lapack_int* ipiv, lapack_complex_float* work,
- lapack_int *info );
-void LAPACK_zsptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
- const lapack_int* ipiv, lapack_complex_double* work,
- lapack_int *info );
-void LAPACK_chptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
- const lapack_int* ipiv, lapack_complex_float* work,
- lapack_int *info );
-void LAPACK_zhptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
- const lapack_int* ipiv, lapack_complex_double* work,
- lapack_int *info );
-void LAPACK_strtri( char* uplo, char* diag, lapack_int* n, float* a,
- lapack_int* lda, lapack_int *info );
-void LAPACK_dtrtri( char* uplo, char* diag, lapack_int* n, double* a,
- lapack_int* lda, lapack_int *info );
-void LAPACK_ctrtri( char* uplo, char* diag, lapack_int* n,
- lapack_complex_float* a, lapack_int* lda,
- lapack_int *info );
-void LAPACK_ztrtri( char* uplo, char* diag, lapack_int* n,
- lapack_complex_double* a, lapack_int* lda,
- lapack_int *info );
-void LAPACK_dtftri( char* transr, char* uplo, char* diag, lapack_int* n,
- double* a, lapack_int *info );
-void LAPACK_stftri( char* transr, char* uplo, char* diag, lapack_int* n,
- float* a, lapack_int *info );
-void LAPACK_ztftri( char* transr, char* uplo, char* diag, lapack_int* n,
- lapack_complex_double* a, lapack_int *info );
-void LAPACK_ctftri( char* transr, char* uplo, char* diag, lapack_int* n,
- lapack_complex_float* a, lapack_int *info );
-void LAPACK_stptri( char* uplo, char* diag, lapack_int* n, float* ap,
- lapack_int *info );
-void LAPACK_dtptri( char* uplo, char* diag, lapack_int* n, double* ap,
- lapack_int *info );
-void LAPACK_ctptri( char* uplo, char* diag, lapack_int* n,
- lapack_complex_float* ap, lapack_int *info );
-void LAPACK_ztptri( char* uplo, char* diag, lapack_int* n,
- lapack_complex_double* ap, lapack_int *info );
-void LAPACK_sgeequ( lapack_int* m, lapack_int* n, const float* a,
- lapack_int* lda, float* r, float* c, float* rowcnd,
- float* colcnd, float* amax, lapack_int *info );
-void LAPACK_dgeequ( lapack_int* m, lapack_int* n, const double* a,
- lapack_int* lda, double* r, double* c, double* rowcnd,
- double* colcnd, double* amax, lapack_int *info );
-void LAPACK_cgeequ( lapack_int* m, lapack_int* n, const lapack_complex_float* a,
- lapack_int* lda, float* r, float* c, float* rowcnd,
- float* colcnd, float* amax, lapack_int *info );
-void LAPACK_zgeequ( lapack_int* m, lapack_int* n,
- const lapack_complex_double* a, lapack_int* lda, double* r,
- double* c, double* rowcnd, double* colcnd, double* amax,
- lapack_int *info );
-void LAPACK_dgeequb( lapack_int* m, lapack_int* n, const double* a,
- lapack_int* lda, double* r, double* c, double* rowcnd,
- double* colcnd, double* amax, lapack_int *info );
-void LAPACK_sgeequb( lapack_int* m, lapack_int* n, const float* a,
- lapack_int* lda, float* r, float* c, float* rowcnd,
- float* colcnd, float* amax, lapack_int *info );
-void LAPACK_zgeequb( lapack_int* m, lapack_int* n,
- const lapack_complex_double* a, lapack_int* lda, double* r,
- double* c, double* rowcnd, double* colcnd, double* amax,
- lapack_int *info );
-void LAPACK_cgeequb( lapack_int* m, lapack_int* n,
- const lapack_complex_float* a, lapack_int* lda, float* r,
- float* c, float* rowcnd, float* colcnd, float* amax,
- lapack_int *info );
-void LAPACK_sgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
- lapack_int* ku, const float* ab, lapack_int* ldab, float* r,
- float* c, float* rowcnd, float* colcnd, float* amax,
- lapack_int *info );
-void LAPACK_dgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
- lapack_int* ku, const double* ab, lapack_int* ldab,
- double* r, double* c, double* rowcnd, double* colcnd,
- double* amax, lapack_int *info );
-void LAPACK_cgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
- lapack_int* ku, const lapack_complex_float* ab,
- lapack_int* ldab, float* r, float* c, float* rowcnd,
- float* colcnd, float* amax, lapack_int *info );
-void LAPACK_zgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
- lapack_int* ku, const lapack_complex_double* ab,
- lapack_int* ldab, double* r, double* c, double* rowcnd,
- double* colcnd, double* amax, lapack_int *info );
-void LAPACK_dgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
- lapack_int* ku, const double* ab, lapack_int* ldab,
- double* r, double* c, double* rowcnd, double* colcnd,
- double* amax, lapack_int *info );
-void LAPACK_sgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
- lapack_int* ku, const float* ab, lapack_int* ldab,
- float* r, float* c, float* rowcnd, float* colcnd,
- float* amax, lapack_int *info );
-void LAPACK_zgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
- lapack_int* ku, const lapack_complex_double* ab,
- lapack_int* ldab, double* r, double* c, double* rowcnd,
- double* colcnd, double* amax, lapack_int *info );
-void LAPACK_cgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
- lapack_int* ku, const lapack_complex_float* ab,
- lapack_int* ldab, float* r, float* c, float* rowcnd,
- float* colcnd, float* amax, lapack_int *info );
-void LAPACK_spoequ( lapack_int* n, const float* a, lapack_int* lda, float* s,
- float* scond, float* amax, lapack_int *info );
-void LAPACK_dpoequ( lapack_int* n, const double* a, lapack_int* lda, double* s,
- double* scond, double* amax, lapack_int *info );
-void LAPACK_cpoequ( lapack_int* n, const lapack_complex_float* a,
- lapack_int* lda, float* s, float* scond, float* amax,
- lapack_int *info );
-void LAPACK_zpoequ( lapack_int* n, const lapack_complex_double* a,
- lapack_int* lda, double* s, double* scond, double* amax,
- lapack_int *info );
-void LAPACK_dpoequb( lapack_int* n, const double* a, lapack_int* lda, double* s,
- double* scond, double* amax, lapack_int *info );
-void LAPACK_spoequb( lapack_int* n, const float* a, lapack_int* lda, float* s,
- float* scond, float* amax, lapack_int *info );
-void LAPACK_zpoequb( lapack_int* n, const lapack_complex_double* a,
- lapack_int* lda, double* s, double* scond, double* amax,
- lapack_int *info );
-void LAPACK_cpoequb( lapack_int* n, const lapack_complex_float* a,
- lapack_int* lda, float* s, float* scond, float* amax,
- lapack_int *info );
-void LAPACK_sppequ( char* uplo, lapack_int* n, const float* ap, float* s,
- float* scond, float* amax, lapack_int *info );
-void LAPACK_dppequ( char* uplo, lapack_int* n, const double* ap, double* s,
- double* scond, double* amax, lapack_int *info );
-void LAPACK_cppequ( char* uplo, lapack_int* n, const lapack_complex_float* ap,
- float* s, float* scond, float* amax, lapack_int *info );
-void LAPACK_zppequ( char* uplo, lapack_int* n, const lapack_complex_double* ap,
- double* s, double* scond, double* amax, lapack_int *info );
-void LAPACK_spbequ( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
- lapack_int* ldab, float* s, float* scond, float* amax,
- lapack_int *info );
-void LAPACK_dpbequ( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
- lapack_int* ldab, double* s, double* scond, double* amax,
- lapack_int *info );
-void LAPACK_cpbequ( char* uplo, lapack_int* n, lapack_int* kd,
- const lapack_complex_float* ab, lapack_int* ldab, float* s,
- float* scond, float* amax, lapack_int *info );
-void LAPACK_zpbequ( char* uplo, lapack_int* n, lapack_int* kd,
- const lapack_complex_double* ab, lapack_int* ldab,
- double* s, double* scond, double* amax, lapack_int *info );
-void LAPACK_dsyequb( char* uplo, lapack_int* n, const double* a,
- lapack_int* lda, double* s, double* scond, double* amax,
- double* work, lapack_int *info );
-void LAPACK_ssyequb( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
- float* s, float* scond, float* amax, float* work,
- lapack_int *info );
-void LAPACK_zsyequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
- lapack_int* lda, double* s, double* scond, double* amax,
- lapack_complex_double* work, lapack_int *info );
-void LAPACK_csyequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
- lapack_int* lda, float* s, float* scond, float* amax,
- lapack_complex_float* work, lapack_int *info );
-void LAPACK_zheequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
- lapack_int* lda, double* s, double* scond, double* amax,
- lapack_complex_double* work, lapack_int *info );
-void LAPACK_cheequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
- lapack_int* lda, float* s, float* scond, float* amax,
- lapack_complex_float* work, lapack_int *info );
-void LAPACK_sgesv( lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda,
- lapack_int* ipiv, float* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_dgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
- lapack_int* ipiv, double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_cgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* a,
- lapack_int* lda, lapack_int* ipiv, lapack_complex_float* b,
- lapack_int* ldb, lapack_int *info );
-void LAPACK_zgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
- lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
- lapack_int* ldb, lapack_int *info );
-void LAPACK_dsgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
- lapack_int* ipiv, double* b, lapack_int* ldb, double* x,
- lapack_int* ldx, double* work, float* swork,
- lapack_int* iter, lapack_int *info );
-void LAPACK_zcgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
- lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
- lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
- lapack_complex_double* work, lapack_complex_float* swork,
- double* rwork, lapack_int* iter, lapack_int *info );
-void LAPACK_sgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- float* a, lapack_int* lda, float* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, float* r, float* c, float* b,
- lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, float* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_dgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- double* a, lapack_int* lda, double* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, double* r, double* c,
- double* b, lapack_int* ldb, double* x, lapack_int* ldx,
- double* rcond, double* ferr, double* berr, double* work,
- lapack_int* iwork, lapack_int *info );
-void LAPACK_cgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, float* r, float* c,
- lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, lapack_complex_float* work,
- float* rwork, lapack_int *info );
-void LAPACK_zgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, double* r, double* c,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* ferr, double* berr, lapack_complex_double* work,
- double* rwork, lapack_int *info );
-void LAPACK_dgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- double* a, lapack_int* lda, double* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, double* r, double* c,
- double* b, lapack_int* ldb, double* x, lapack_int* ldx,
- double* rcond, double* rpvgrw, double* berr,
- lapack_int* n_err_bnds, double* err_bnds_norm,
- double* err_bnds_comp, lapack_int* nparams, double* params,
- double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- float* a, lapack_int* lda, float* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, float* r, float* c,
- float* b, lapack_int* ldb, float* x, lapack_int* ldx,
- float* rcond, float* rpvgrw, float* berr,
- lapack_int* n_err_bnds, float* err_bnds_norm,
- float* err_bnds_comp, lapack_int* nparams, float* params,
- float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, double* r, double* c,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* rpvgrw, double* berr, lapack_int* n_err_bnds,
- double* err_bnds_norm, double* err_bnds_comp,
- lapack_int* nparams, double* params,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_cgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, float* r, float* c,
- lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* rpvgrw, float* berr, lapack_int* n_err_bnds,
- float* err_bnds_norm, float* err_bnds_comp,
- lapack_int* nparams, float* params,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_sgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
- lapack_int* nrhs, float* ab, lapack_int* ldab,
- lapack_int* ipiv, float* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_dgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
- lapack_int* nrhs, double* ab, lapack_int* ldab,
- lapack_int* ipiv, double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_cgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
- lapack_int* nrhs, lapack_complex_float* ab, lapack_int* ldab,
- lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_zgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
- lapack_int* nrhs, lapack_complex_double* ab,
- lapack_int* ldab, lapack_int* ipiv, lapack_complex_double* b,
- lapack_int* ldb, lapack_int *info );
-void LAPACK_sgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
- lapack_int* ku, lapack_int* nrhs, float* ab,
- lapack_int* ldab, float* afb, lapack_int* ldafb,
- lapack_int* ipiv, char* equed, float* r, float* c, float* b,
- lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, float* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_dgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
- lapack_int* ku, lapack_int* nrhs, double* ab,
- lapack_int* ldab, double* afb, lapack_int* ldafb,
- lapack_int* ipiv, char* equed, double* r, double* c,
- double* b, lapack_int* ldb, double* x, lapack_int* ldx,
- double* rcond, double* ferr, double* berr, double* work,
- lapack_int* iwork, lapack_int *info );
-void LAPACK_cgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
- lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
- lapack_int* ldab, lapack_complex_float* afb,
- lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
- float* c, lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, lapack_complex_float* work,
- float* rwork, lapack_int *info );
-void LAPACK_zgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
- lapack_int* ku, lapack_int* nrhs, lapack_complex_double* ab,
- lapack_int* ldab, lapack_complex_double* afb,
- lapack_int* ldafb, lapack_int* ipiv, char* equed, double* r,
- double* c, lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* ferr, double* berr, lapack_complex_double* work,
- double* rwork, lapack_int *info );
-void LAPACK_dgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
- lapack_int* ku, lapack_int* nrhs, double* ab,
- lapack_int* ldab, double* afb, lapack_int* ldafb,
- lapack_int* ipiv, char* equed, double* r, double* c,
- double* b, lapack_int* ldb, double* x, lapack_int* ldx,
- double* rcond, double* rpvgrw, double* berr,
- lapack_int* n_err_bnds, double* err_bnds_norm,
- double* err_bnds_comp, lapack_int* nparams, double* params,
- double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
- lapack_int* ku, lapack_int* nrhs, float* ab,
- lapack_int* ldab, float* afb, lapack_int* ldafb,
- lapack_int* ipiv, char* equed, float* r, float* c,
- float* b, lapack_int* ldb, float* x, lapack_int* ldx,
- float* rcond, float* rpvgrw, float* berr,
- lapack_int* n_err_bnds, float* err_bnds_norm,
- float* err_bnds_comp, lapack_int* nparams, float* params,
- float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
- lapack_int* ku, lapack_int* nrhs,
- lapack_complex_double* ab, lapack_int* ldab,
- lapack_complex_double* afb, lapack_int* ldafb,
- lapack_int* ipiv, char* equed, double* r, double* c,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* rpvgrw, double* berr, lapack_int* n_err_bnds,
- double* err_bnds_norm, double* err_bnds_comp,
- lapack_int* nparams, double* params,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_cgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
- lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
- lapack_int* ldab, lapack_complex_float* afb,
- lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
- float* c, lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* rpvgrw, float* berr, lapack_int* n_err_bnds,
- float* err_bnds_norm, float* err_bnds_comp,
- lapack_int* nparams, float* params,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_sgtsv( lapack_int* n, lapack_int* nrhs, float* dl, float* d,
- float* du, float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dgtsv( lapack_int* n, lapack_int* nrhs, double* dl, double* d,
- double* du, double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_cgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* dl,
- lapack_complex_float* d, lapack_complex_float* du,
- lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* dl,
- lapack_complex_double* d, lapack_complex_double* du,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_sgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- const float* dl, const float* d, const float* du,
- float* dlf, float* df, float* duf, float* du2,
- lapack_int* ipiv, const float* b, lapack_int* ldb, float* x,
- lapack_int* ldx, float* rcond, float* ferr, float* berr,
- float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- const double* dl, const double* d, const double* du,
- double* dlf, double* df, double* duf, double* du2,
- lapack_int* ipiv, const double* b, lapack_int* ldb,
- double* x, lapack_int* ldx, double* rcond, double* ferr,
- double* berr, double* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_cgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_float* dl,
- const lapack_complex_float* d,
- const lapack_complex_float* du, lapack_complex_float* dlf,
- lapack_complex_float* df, lapack_complex_float* duf,
- lapack_complex_float* du2, lapack_int* ipiv,
- const lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, lapack_complex_float* work,
- float* rwork, lapack_int *info );
-void LAPACK_zgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_double* dl,
- const lapack_complex_double* d,
- const lapack_complex_double* du, lapack_complex_double* dlf,
- lapack_complex_double* df, lapack_complex_double* duf,
- lapack_complex_double* du2, lapack_int* ipiv,
- const lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* ferr, double* berr, lapack_complex_double* work,
- double* rwork, lapack_int *info );
-void LAPACK_sposv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
- lapack_int* lda, float* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_dposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
- lapack_int* lda, double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_cposv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zposv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_dsposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
- lapack_int* lda, double* b, lapack_int* ldb, double* x,
- lapack_int* ldx, double* work, float* swork,
- lapack_int* iter, lapack_int *info );
-void LAPACK_zcposv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx,
- lapack_complex_double* work, lapack_complex_float* swork,
- double* rwork, lapack_int* iter, lapack_int *info );
-void LAPACK_sposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- float* a, lapack_int* lda, float* af, lapack_int* ldaf,
- char* equed, float* s, float* b, lapack_int* ldb, float* x,
- lapack_int* ldx, float* rcond, float* ferr, float* berr,
- float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- double* a, lapack_int* lda, double* af, lapack_int* ldaf,
- char* equed, double* s, double* b, lapack_int* ldb,
- double* x, lapack_int* ldx, double* rcond, double* ferr,
- double* berr, double* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_cposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* af, lapack_int* ldaf, char* equed,
- float* s, lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, lapack_complex_float* work,
- float* rwork, lapack_int *info );
-void LAPACK_zposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* af, lapack_int* ldaf, char* equed,
- double* s, lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* ferr, double* berr, lapack_complex_double* work,
- double* rwork, lapack_int *info );
-void LAPACK_dposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- double* a, lapack_int* lda, double* af, lapack_int* ldaf,
- char* equed, double* s, double* b, lapack_int* ldb,
- double* x, lapack_int* ldx, double* rcond, double* rpvgrw,
- double* berr, lapack_int* n_err_bnds,
- double* err_bnds_norm, double* err_bnds_comp,
- lapack_int* nparams, double* params, double* work,
- lapack_int* iwork, lapack_int *info );
-void LAPACK_sposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- float* a, lapack_int* lda, float* af, lapack_int* ldaf,
- char* equed, float* s, float* b, lapack_int* ldb, float* x,
- lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
- lapack_int* n_err_bnds, float* err_bnds_norm,
- float* err_bnds_comp, lapack_int* nparams, float* params,
- float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* af, lapack_int* ldaf, char* equed,
- double* s, lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* rpvgrw, double* berr, lapack_int* n_err_bnds,
- double* err_bnds_norm, double* err_bnds_comp,
- lapack_int* nparams, double* params,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_cposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* af, lapack_int* ldaf, char* equed,
- float* s, lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* rpvgrw, float* berr, lapack_int* n_err_bnds,
- float* err_bnds_norm, float* err_bnds_comp,
- lapack_int* nparams, float* params,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_sppsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
- float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dppsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
- double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_cppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* ap, lapack_complex_float* b,
- lapack_int* ldb, lapack_int *info );
-void LAPACK_zppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* ap, lapack_complex_double* b,
- lapack_int* ldb, lapack_int *info );
-void LAPACK_sppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- float* ap, float* afp, char* equed, float* s, float* b,
- lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, float* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_dppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- double* ap, double* afp, char* equed, double* s, double* b,
- lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
- double* ferr, double* berr, double* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_cppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* ap, lapack_complex_float* afp,
- char* equed, float* s, lapack_complex_float* b,
- lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
- float* rcond, float* ferr, float* berr,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_zppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* ap, lapack_complex_double* afp,
- char* equed, double* s, lapack_complex_double* b,
- lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
- double* rcond, double* ferr, double* berr,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_spbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
- float* ab, lapack_int* ldab, float* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_dpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
- double* ab, lapack_int* ldab, double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_cpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
- lapack_complex_float* ab, lapack_int* ldab,
- lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
- lapack_complex_double* ab, lapack_int* ldab,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_spbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
- lapack_int* nrhs, float* ab, lapack_int* ldab, float* afb,
- lapack_int* ldafb, char* equed, float* s, float* b,
- lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, float* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_dpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
- lapack_int* nrhs, double* ab, lapack_int* ldab, double* afb,
- lapack_int* ldafb, char* equed, double* s, double* b,
- lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
- double* ferr, double* berr, double* work, lapack_int* iwork,
- lapack_int *info );
-void LAPACK_cpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
- lapack_int* nrhs, lapack_complex_float* ab,
- lapack_int* ldab, lapack_complex_float* afb,
- lapack_int* ldafb, char* equed, float* s,
- lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, lapack_complex_float* work,
- float* rwork, lapack_int *info );
-void LAPACK_zpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
- lapack_int* nrhs, lapack_complex_double* ab,
- lapack_int* ldab, lapack_complex_double* afb,
- lapack_int* ldafb, char* equed, double* s,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* ferr, double* berr, lapack_complex_double* work,
- double* rwork, lapack_int *info );
-void LAPACK_sptsv( lapack_int* n, lapack_int* nrhs, float* d, float* e,
- float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dptsv( lapack_int* n, lapack_int* nrhs, double* d, double* e,
- double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_cptsv( lapack_int* n, lapack_int* nrhs, float* d,
- lapack_complex_float* e, lapack_complex_float* b,
- lapack_int* ldb, lapack_int *info );
-void LAPACK_zptsv( lapack_int* n, lapack_int* nrhs, double* d,
- lapack_complex_double* e, lapack_complex_double* b,
- lapack_int* ldb, lapack_int *info );
-void LAPACK_sptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
- const float* e, float* df, float* ef, const float* b,
- lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, float* work, lapack_int *info );
-void LAPACK_dptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
- const double* d, const double* e, double* df, double* ef,
- const double* b, lapack_int* ldb, double* x,
- lapack_int* ldx, double* rcond, double* ferr, double* berr,
- double* work, lapack_int *info );
-void LAPACK_cptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
- const lapack_complex_float* e, float* df,
- lapack_complex_float* ef, const lapack_complex_float* b,
- lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
- float* rcond, float* ferr, float* berr,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_zptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
- const double* d, const lapack_complex_double* e, double* df,
- lapack_complex_double* ef, const lapack_complex_double* b,
- lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
- double* rcond, double* ferr, double* berr,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_ssysv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
- lapack_int* lda, lapack_int* ipiv, float* b, lapack_int* ldb,
- float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsysv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
- lapack_int* lda, lapack_int* ipiv, double* b,
- lapack_int* ldb, double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_csysv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
- lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zsysv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_ssysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const float* a, lapack_int* lda, float* af,
- lapack_int* ldaf, lapack_int* ipiv, const float* b,
- lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
- float* ferr, float* berr, float* work, lapack_int* lwork,
- lapack_int* iwork, lapack_int *info );
-void LAPACK_dsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const double* a, lapack_int* lda, double* af,
- lapack_int* ldaf, lapack_int* ipiv, const double* b,
- lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
- double* ferr, double* berr, double* work, lapack_int* lwork,
- lapack_int* iwork, lapack_int *info );
-void LAPACK_csysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* af, lapack_int* ldaf,
- lapack_int* ipiv, const lapack_complex_float* b,
- lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
- float* rcond, float* ferr, float* berr,
- lapack_complex_float* work, lapack_int* lwork, float* rwork,
- lapack_int *info );
-void LAPACK_zsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* af, lapack_int* ldaf,
- lapack_int* ipiv, const lapack_complex_double* b,
- lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
- double* rcond, double* ferr, double* berr,
- lapack_complex_double* work, lapack_int* lwork,
- double* rwork, lapack_int *info );
-void LAPACK_dsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- double* a, lapack_int* lda, double* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, double* s, double* b,
- lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
- double* rpvgrw, double* berr, lapack_int* n_err_bnds,
- double* err_bnds_norm, double* err_bnds_comp,
- lapack_int* nparams, double* params, double* work,
- lapack_int* iwork, lapack_int *info );
-void LAPACK_ssysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- float* a, lapack_int* lda, float* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, float* s, float* b,
- lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
- float* rpvgrw, float* berr, lapack_int* n_err_bnds,
- float* err_bnds_norm, float* err_bnds_comp,
- lapack_int* nparams, float* params, float* work,
- lapack_int* iwork, lapack_int *info );
-void LAPACK_zsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, double* s,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* rpvgrw, double* berr, lapack_int* n_err_bnds,
- double* err_bnds_norm, double* err_bnds_comp,
- lapack_int* nparams, double* params,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_csysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, float* s,
- lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* rpvgrw, float* berr, lapack_int* n_err_bnds,
- float* err_bnds_norm, float* err_bnds_comp,
- lapack_int* nparams, float* params,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_chesv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
- lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zhesv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_chesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* af, lapack_int* ldaf,
- lapack_int* ipiv, const lapack_complex_float* b,
- lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
- float* rcond, float* ferr, float* berr,
- lapack_complex_float* work, lapack_int* lwork, float* rwork,
- lapack_int *info );
-void LAPACK_zhesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* af, lapack_int* ldaf,
- lapack_int* ipiv, const lapack_complex_double* b,
- lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
- double* rcond, double* ferr, double* berr,
- lapack_complex_double* work, lapack_int* lwork,
- double* rwork, lapack_int *info );
-void LAPACK_zhesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* a, lapack_int* lda,
- lapack_complex_double* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, double* s,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_complex_double* x, lapack_int* ldx, double* rcond,
- double* rpvgrw, double* berr, lapack_int* n_err_bnds,
- double* err_bnds_norm, double* err_bnds_comp,
- lapack_int* nparams, double* params,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_chesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* a, lapack_int* lda,
- lapack_complex_float* af, lapack_int* ldaf,
- lapack_int* ipiv, char* equed, float* s,
- lapack_complex_float* b, lapack_int* ldb,
- lapack_complex_float* x, lapack_int* ldx, float* rcond,
- float* rpvgrw, float* berr, lapack_int* n_err_bnds,
- float* err_bnds_norm, float* err_bnds_comp,
- lapack_int* nparams, float* params,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_sspsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
- lapack_int* ipiv, float* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_dspsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
- lapack_int* ipiv, double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_cspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* ap, lapack_int* ipiv,
- lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* ap, lapack_int* ipiv,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_sspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const float* ap, float* afp, lapack_int* ipiv,
- const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
- float* rcond, float* ferr, float* berr, float* work,
- lapack_int* iwork, lapack_int *info );
-void LAPACK_dspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const double* ap, double* afp, lapack_int* ipiv,
- const double* b, lapack_int* ldb, double* x,
- lapack_int* ldx, double* rcond, double* ferr, double* berr,
- double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_float* ap, lapack_complex_float* afp,
- lapack_int* ipiv, const lapack_complex_float* b,
- lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
- float* rcond, float* ferr, float* berr,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_zspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_double* ap, lapack_complex_double* afp,
- lapack_int* ipiv, const lapack_complex_double* b,
- lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
- double* rcond, double* ferr, double* berr,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_chpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_float* ap, lapack_int* ipiv,
- lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zhpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
- lapack_complex_double* ap, lapack_int* ipiv,
- lapack_complex_double* b, lapack_int* ldb,
- lapack_int *info );
-void LAPACK_chpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_float* ap, lapack_complex_float* afp,
- lapack_int* ipiv, const lapack_complex_float* b,
- lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
- float* rcond, float* ferr, float* berr,
- lapack_complex_float* work, float* rwork,
- lapack_int *info );
-void LAPACK_zhpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
- const lapack_complex_double* ap, lapack_complex_double* afp,
- lapack_int* ipiv, const lapack_complex_double* b,
- lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
- double* rcond, double* ferr, double* berr,
- lapack_complex_double* work, double* rwork,
- lapack_int *info );
-void LAPACK_sgeqrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
- float* tau, float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_dgeqrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
- double* tau, double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_cgeqrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, lapack_complex_float* tau,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zgeqrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sgeqpf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
- lapack_int* jpvt, float* tau, float* work,
- lapack_int *info );
-void LAPACK_dgeqpf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
- lapack_int* jpvt, double* tau, double* work,
- lapack_int *info );
-void LAPACK_cgeqpf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, lapack_int* jpvt,
- lapack_complex_float* tau, lapack_complex_float* work,
- float* rwork, lapack_int *info );
-void LAPACK_zgeqpf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, lapack_int* jpvt,
- lapack_complex_double* tau, lapack_complex_double* work,
- double* rwork, lapack_int *info );
-void LAPACK_sgeqp3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
- lapack_int* jpvt, float* tau, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dgeqp3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
- lapack_int* jpvt, double* tau, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_cgeqp3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, lapack_int* jpvt,
- lapack_complex_float* tau, lapack_complex_float* work,
- lapack_int* lwork, float* rwork, lapack_int *info );
-void LAPACK_zgeqp3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, lapack_int* jpvt,
- lapack_complex_double* tau, lapack_complex_double* work,
- lapack_int* lwork, double* rwork, lapack_int *info );
-void LAPACK_sorgqr( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
- lapack_int* lda, const float* tau, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dorgqr( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
- lapack_int* lda, const double* tau, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_sormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const float* a, lapack_int* lda,
- const float* tau, float* c, lapack_int* ldc, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const double* a, lapack_int* lda,
- const double* tau, double* c, lapack_int* ldc, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_cungqr( lapack_int* m, lapack_int* n, lapack_int* k,
- lapack_complex_float* a, lapack_int* lda,
- const lapack_complex_float* tau, lapack_complex_float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_zungqr( lapack_int* m, lapack_int* n, lapack_int* k,
- lapack_complex_double* a, lapack_int* lda,
- const lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_cunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const lapack_complex_float* a,
- lapack_int* lda, const lapack_complex_float* tau,
- lapack_complex_float* c, lapack_int* ldc,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const lapack_complex_double* a,
- lapack_int* lda, const lapack_complex_double* tau,
- lapack_complex_double* c, lapack_int* ldc,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sgelqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
- float* tau, float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_dgelqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
- double* tau, double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_cgelqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, lapack_complex_float* tau,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zgelqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sorglq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
- lapack_int* lda, const float* tau, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dorglq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
- lapack_int* lda, const double* tau, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_sormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const float* a, lapack_int* lda,
- const float* tau, float* c, lapack_int* ldc, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const double* a, lapack_int* lda,
- const double* tau, double* c, lapack_int* ldc, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_cunglq( lapack_int* m, lapack_int* n, lapack_int* k,
- lapack_complex_float* a, lapack_int* lda,
- const lapack_complex_float* tau, lapack_complex_float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_zunglq( lapack_int* m, lapack_int* n, lapack_int* k,
- lapack_complex_double* a, lapack_int* lda,
- const lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_cunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const lapack_complex_float* a,
- lapack_int* lda, const lapack_complex_float* tau,
- lapack_complex_float* c, lapack_int* ldc,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const lapack_complex_double* a,
- lapack_int* lda, const lapack_complex_double* tau,
- lapack_complex_double* c, lapack_int* ldc,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sgeqlf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
- float* tau, float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_dgeqlf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
- double* tau, double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_cgeqlf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, lapack_complex_float* tau,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zgeqlf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sorgql( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
- lapack_int* lda, const float* tau, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dorgql( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
- lapack_int* lda, const double* tau, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_cungql( lapack_int* m, lapack_int* n, lapack_int* k,
- lapack_complex_float* a, lapack_int* lda,
- const lapack_complex_float* tau, lapack_complex_float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_zungql( lapack_int* m, lapack_int* n, lapack_int* k,
- lapack_complex_double* a, lapack_int* lda,
- const lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sormql( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const float* a, lapack_int* lda,
- const float* tau, float* c, lapack_int* ldc, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dormql( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const double* a, lapack_int* lda,
- const double* tau, double* c, lapack_int* ldc, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_cunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const lapack_complex_float* a,
- lapack_int* lda, const lapack_complex_float* tau,
- lapack_complex_float* c, lapack_int* ldc,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const lapack_complex_double* a,
- lapack_int* lda, const lapack_complex_double* tau,
- lapack_complex_double* c, lapack_int* ldc,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sgerqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
- float* tau, float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_dgerqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
- double* tau, double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_cgerqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, lapack_complex_float* tau,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zgerqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sorgrq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
- lapack_int* lda, const float* tau, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dorgrq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
- lapack_int* lda, const double* tau, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_cungrq( lapack_int* m, lapack_int* n, lapack_int* k,
- lapack_complex_float* a, lapack_int* lda,
- const lapack_complex_float* tau, lapack_complex_float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_zungrq( lapack_int* m, lapack_int* n, lapack_int* k,
- lapack_complex_double* a, lapack_int* lda,
- const lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_sormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const float* a, lapack_int* lda,
- const float* tau, float* c, lapack_int* ldc, float* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_dormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const double* a, lapack_int* lda,
- const double* tau, double* c, lapack_int* ldc, double* work,
- lapack_int* lwork, lapack_int *info );
-void LAPACK_cunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const lapack_complex_float* a,
- lapack_int* lda, const lapack_complex_float* tau,
- lapack_complex_float* c, lapack_int* ldc,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_zunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
- lapack_int* k, const lapack_complex_double* a,
- lapack_int* lda, const lapack_complex_double* tau,
- lapack_complex_double* c, lapack_int* ldc,
- lapack_complex_double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_stzrzf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
- float* tau, float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_dtzrzf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
- double* tau, double* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_ctzrzf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
- lapack_int* lda, lapack_complex_float* tau,
- lapack_complex_float* work, lapack_int* lwork,
- lapack_int *info );
-void LAPACK_ztzrzf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
- lapack_int* lda, lapack_complex_double* tau,
- lapack_complex_double* work, lapack_int* lwork,
-
lapack_int *info ); -void LAPACK_sormrz( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* l, const float* a, - lapack_int* lda, const float* tau, float* c, - lapack_int* ldc, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dormrz( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* l, const double* a, - lapack_int* lda, const double* tau, double* c, - lapack_int* ldc, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cunmrz( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* l, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* tau, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zunmrz( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* l, - const lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* tau, lapack_complex_double* c, - lapack_int* ldc, lapack_complex_double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_sggqrf( lapack_int* n, lapack_int* m, lapack_int* p, float* a, - lapack_int* lda, float* taua, float* b, lapack_int* ldb, - float* taub, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dggqrf( lapack_int* n, lapack_int* m, lapack_int* p, double* a, - lapack_int* lda, double* taua, double* b, lapack_int* ldb, - double* taub, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cggqrf( lapack_int* n, lapack_int* m, lapack_int* p, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* taua, lapack_complex_float* b, - lapack_int* ldb, lapack_complex_float* taub, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zggqrf( lapack_int* n, lapack_int* m, lapack_int* p, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* taua, lapack_complex_double* b, - lapack_int* ldb, lapack_complex_double* taub, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_sggrqf( lapack_int* m, lapack_int* p, lapack_int* n, float* a, - lapack_int* lda, float* taua, float* b, lapack_int* ldb, - float* taub, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dggrqf( lapack_int* m, lapack_int* p, lapack_int* n, double* a, - lapack_int* lda, double* taua, double* b, lapack_int* ldb, - double* taub, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cggrqf( lapack_int* m, lapack_int* p, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* taua, lapack_complex_float* b, - lapack_int* ldb, lapack_complex_float* taub, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zggrqf( lapack_int* m, lapack_int* p, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* taua, lapack_complex_double* b, - lapack_int* ldb, lapack_complex_double* taub, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_sgebrd( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - float* d, float* e, float* tauq, float* taup, float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_dgebrd( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, - double* d, double* e, double* tauq, double* taup, - double* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_cgebrd( lapack_int* m, lapack_int* n, lapack_complex_float* a, - lapack_int* 
lda, float* d, float* e, - lapack_complex_float* tauq, lapack_complex_float* taup, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zgebrd( lapack_int* m, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, double* d, double* e, - lapack_complex_double* tauq, lapack_complex_double* taup, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_sgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, - lapack_int* kl, lapack_int* ku, float* ab, lapack_int* ldab, - float* d, float* e, float* q, lapack_int* ldq, float* pt, - lapack_int* ldpt, float* c, lapack_int* ldc, float* work, - lapack_int *info ); -void LAPACK_dgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, - lapack_int* kl, lapack_int* ku, double* ab, - lapack_int* ldab, double* d, double* e, double* q, - lapack_int* ldq, double* pt, lapack_int* ldpt, double* c, - lapack_int* ldc, double* work, lapack_int *info ); -void LAPACK_cgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, - lapack_int* kl, lapack_int* ku, lapack_complex_float* ab, - lapack_int* ldab, float* d, float* e, - lapack_complex_float* q, lapack_int* ldq, - lapack_complex_float* pt, lapack_int* ldpt, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work, float* rwork, - lapack_int *info ); -void LAPACK_zgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, - lapack_int* kl, lapack_int* ku, lapack_complex_double* ab, - lapack_int* ldab, double* d, double* e, - lapack_complex_double* q, lapack_int* ldq, - lapack_complex_double* pt, lapack_int* ldpt, - lapack_complex_double* c, lapack_int* ldc, - lapack_complex_double* work, double* rwork, - lapack_int *info ); -void LAPACK_sorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k, - float* a, lapack_int* lda, const float* tau, float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_dorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k, - double* a, lapack_int* lda, const double* tau, double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_sormbr( char* vect, char* side, char* trans, lapack_int* m, - lapack_int* n, lapack_int* k, const float* a, - lapack_int* lda, const float* tau, float* c, - lapack_int* ldc, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dormbr( char* vect, char* side, char* trans, lapack_int* m, - lapack_int* n, lapack_int* k, const double* a, - lapack_int* lda, const double* tau, double* c, - lapack_int* ldc, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k, - lapack_complex_float* a, lapack_int* lda, - const lapack_complex_float* tau, lapack_complex_float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_zungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k, - lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* tau, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cunmbr( char* vect, char* side, char* trans, lapack_int* m, - lapack_int* n, lapack_int* k, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* tau, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zunmbr( char* vect, char* side, char* trans, lapack_int* m, - lapack_int* n, lapack_int* k, - const lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* tau, 
lapack_complex_double* c, - lapack_int* ldc, lapack_complex_double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_sbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt, - lapack_int* nru, lapack_int* ncc, float* d, float* e, - float* vt, lapack_int* ldvt, float* u, lapack_int* ldu, - float* c, lapack_int* ldc, float* work, lapack_int *info ); -void LAPACK_dbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt, - lapack_int* nru, lapack_int* ncc, double* d, double* e, - double* vt, lapack_int* ldvt, double* u, lapack_int* ldu, - double* c, lapack_int* ldc, double* work, - lapack_int *info ); -void LAPACK_cbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt, - lapack_int* nru, lapack_int* ncc, float* d, float* e, - lapack_complex_float* vt, lapack_int* ldvt, - lapack_complex_float* u, lapack_int* ldu, - lapack_complex_float* c, lapack_int* ldc, float* work, - lapack_int *info ); -void LAPACK_zbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt, - lapack_int* nru, lapack_int* ncc, double* d, double* e, - lapack_complex_double* vt, lapack_int* ldvt, - lapack_complex_double* u, lapack_int* ldu, - lapack_complex_double* c, lapack_int* ldc, double* work, - lapack_int *info ); -void LAPACK_sbdsdc( char* uplo, char* compq, lapack_int* n, float* d, float* e, - float* u, lapack_int* ldu, float* vt, lapack_int* ldvt, - float* q, lapack_int* iq, float* work, lapack_int* iwork, - lapack_int *info ); -void LAPACK_dbdsdc( char* uplo, char* compq, lapack_int* n, double* d, - double* e, double* u, lapack_int* ldu, double* vt, - lapack_int* ldvt, double* q, lapack_int* iq, double* work, - lapack_int* iwork, lapack_int *info ); -void LAPACK_sbdsvdx( char* uplo, char* jobz, char* range, - lapack_int* n, float* d, float* e, - float* vl, float* vu, - lapack_int* il, lapack_int* iu, lapack_int* ns, - float* s, float* z, lapack_int* ldz, - float* work, lapack_int *iwork, lapack_int *info ); -void LAPACK_dbdsvdx( char* uplo, char* jobz, char* range, - lapack_int* n, double* d, double* e, - double* vl, double* vu, - lapack_int* il, lapack_int* iu, lapack_int* ns, - double* s, double* z, lapack_int* ldz, - double* work, lapack_int *iwork, lapack_int *info ); -void LAPACK_ssytrd( char* uplo, lapack_int* n, float* a, lapack_int* lda, - float* d, float* e, float* tau, float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_dsytrd( char* uplo, lapack_int* n, double* a, lapack_int* lda, - double* d, double* e, double* tau, double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_sorgtr( char* uplo, lapack_int* n, float* a, lapack_int* lda, - const float* tau, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dorgtr( char* uplo, lapack_int* n, double* a, lapack_int* lda, - const double* tau, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_sormtr( char* side, char* uplo, char* trans, lapack_int* m, - lapack_int* n, const float* a, lapack_int* lda, - const float* tau, float* c, lapack_int* ldc, float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_dormtr( char* side, char* uplo, char* trans, lapack_int* m, - lapack_int* n, const double* a, lapack_int* lda, - const double* tau, double* c, lapack_int* ldc, double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_chetrd( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, float* d, float* e, - lapack_complex_float* tau, lapack_complex_float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_zhetrd( char* uplo, lapack_int* n, lapack_complex_double* a, - 
lapack_int* lda, double* d, double* e, - lapack_complex_double* tau, lapack_complex_double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_cungtr( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* tau, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zungtr( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, const lapack_complex_double* tau, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cunmtr( char* side, char* uplo, char* trans, lapack_int* m, - lapack_int* n, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* tau, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zunmtr( char* side, char* uplo, char* trans, lapack_int* m, - lapack_int* n, const lapack_complex_double* a, - lapack_int* lda, const lapack_complex_double* tau, - lapack_complex_double* c, lapack_int* ldc, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_ssptrd( char* uplo, lapack_int* n, float* ap, float* d, float* e, - float* tau, lapack_int *info ); -void LAPACK_dsptrd( char* uplo, lapack_int* n, double* ap, double* d, double* e, - double* tau, lapack_int *info ); -void LAPACK_sopgtr( char* uplo, lapack_int* n, const float* ap, - const float* tau, float* q, lapack_int* ldq, float* work, - lapack_int *info ); -void LAPACK_dopgtr( char* uplo, lapack_int* n, const double* ap, - const double* tau, double* q, lapack_int* ldq, double* work, - lapack_int *info ); -void LAPACK_sopmtr( char* side, char* uplo, char* trans, lapack_int* m, - lapack_int* n, const float* ap, const float* tau, float* c, - lapack_int* ldc, float* work, lapack_int *info ); -void LAPACK_dopmtr( char* side, char* uplo, char* trans, lapack_int* m, - lapack_int* n, const double* ap, const double* tau, - double* c, lapack_int* ldc, double* work, - lapack_int *info ); -void LAPACK_chptrd( char* uplo, lapack_int* n, lapack_complex_float* ap, - float* d, float* e, lapack_complex_float* tau, - lapack_int *info ); -void LAPACK_zhptrd( char* uplo, lapack_int* n, lapack_complex_double* ap, - double* d, double* e, lapack_complex_double* tau, - lapack_int *info ); -void LAPACK_cupgtr( char* uplo, lapack_int* n, const lapack_complex_float* ap, - const lapack_complex_float* tau, lapack_complex_float* q, - lapack_int* ldq, lapack_complex_float* work, - lapack_int *info ); -void LAPACK_zupgtr( char* uplo, lapack_int* n, const lapack_complex_double* ap, - const lapack_complex_double* tau, lapack_complex_double* q, - lapack_int* ldq, lapack_complex_double* work, - lapack_int *info ); -void LAPACK_cupmtr( char* side, char* uplo, char* trans, lapack_int* m, - lapack_int* n, const lapack_complex_float* ap, - const lapack_complex_float* tau, lapack_complex_float* c, - lapack_int* ldc, lapack_complex_float* work, - lapack_int *info ); -void LAPACK_zupmtr( char* side, char* uplo, char* trans, lapack_int* m, - lapack_int* n, const lapack_complex_double* ap, - const lapack_complex_double* tau, lapack_complex_double* c, - lapack_int* ldc, lapack_complex_double* work, - lapack_int *info ); -void LAPACK_ssbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd, - float* ab, lapack_int* ldab, float* d, float* e, float* q, - lapack_int* ldq, float* work, lapack_int *info ); -void LAPACK_dsbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd, - double* ab, lapack_int* ldab, double* d, 
double* e, - double* q, lapack_int* ldq, double* work, - lapack_int *info ); -void LAPACK_chbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd, - lapack_complex_float* ab, lapack_int* ldab, float* d, - float* e, lapack_complex_float* q, lapack_int* ldq, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_zhbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd, - lapack_complex_double* ab, lapack_int* ldab, double* d, - double* e, lapack_complex_double* q, lapack_int* ldq, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_ssterf( lapack_int* n, float* d, float* e, lapack_int *info ); -void LAPACK_dsterf( lapack_int* n, double* d, double* e, lapack_int *info ); -void LAPACK_ssteqr( char* compz, lapack_int* n, float* d, float* e, float* z, - lapack_int* ldz, float* work, lapack_int *info ); -void LAPACK_dsteqr( char* compz, lapack_int* n, double* d, double* e, double* z, - lapack_int* ldz, double* work, lapack_int *info ); -void LAPACK_csteqr( char* compz, lapack_int* n, float* d, float* e, - lapack_complex_float* z, lapack_int* ldz, float* work, - lapack_int *info ); -void LAPACK_zsteqr( char* compz, lapack_int* n, double* d, double* e, - lapack_complex_double* z, lapack_int* ldz, double* work, - lapack_int *info ); -void LAPACK_sstemr( char* jobz, char* range, lapack_int* n, float* d, float* e, - float* vl, float* vu, lapack_int* il, lapack_int* iu, - lapack_int* m, float* w, float* z, lapack_int* ldz, - lapack_int* nzc, lapack_int* isuppz, lapack_logical* tryrac, - float* work, lapack_int* lwork, lapack_int* iwork, - lapack_int* liwork, lapack_int *info ); -void LAPACK_dstemr( char* jobz, char* range, lapack_int* n, double* d, - double* e, double* vl, double* vu, lapack_int* il, - lapack_int* iu, lapack_int* m, double* w, double* z, - lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz, - lapack_logical* tryrac, double* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_cstemr( char* jobz, char* range, lapack_int* n, float* d, float* e, - float* vl, float* vu, lapack_int* il, lapack_int* iu, - lapack_int* m, float* w, lapack_complex_float* z, - lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz, - lapack_logical* tryrac, float* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_zstemr( char* jobz, char* range, lapack_int* n, double* d, - double* e, double* vl, double* vu, lapack_int* il, - lapack_int* iu, lapack_int* m, double* w, - lapack_complex_double* z, lapack_int* ldz, lapack_int* nzc, - lapack_int* isuppz, lapack_logical* tryrac, double* work, - lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, - lapack_int *info ); -void LAPACK_sstedc( char* compz, lapack_int* n, float* d, float* e, float* z, - lapack_int* ldz, float* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_dstedc( char* compz, lapack_int* n, double* d, double* e, double* z, - lapack_int* ldz, double* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_cstedc( char* compz, lapack_int* n, float* d, float* e, - lapack_complex_float* z, lapack_int* ldz, - lapack_complex_float* work, lapack_int* lwork, float* rwork, - lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, - lapack_int *info ); -void LAPACK_zstedc( char* compz, lapack_int* n, double* d, double* e, - lapack_complex_double* z, lapack_int* ldz, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, 
lapack_int* lrwork, lapack_int* iwork, - lapack_int* liwork, lapack_int *info ); -void LAPACK_sstegr( char* jobz, char* range, lapack_int* n, float* d, float* e, - float* vl, float* vu, lapack_int* il, lapack_int* iu, - float* abstol, lapack_int* m, float* w, float* z, - lapack_int* ldz, lapack_int* isuppz, float* work, - lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, - lapack_int *info ); -void LAPACK_dstegr( char* jobz, char* range, lapack_int* n, double* d, - double* e, double* vl, double* vu, lapack_int* il, - lapack_int* iu, double* abstol, lapack_int* m, double* w, - double* z, lapack_int* ldz, lapack_int* isuppz, - double* work, lapack_int* lwork, lapack_int* iwork, - lapack_int* liwork, lapack_int *info ); -void LAPACK_cstegr( char* jobz, char* range, lapack_int* n, float* d, float* e, - float* vl, float* vu, lapack_int* il, lapack_int* iu, - float* abstol, lapack_int* m, float* w, - lapack_complex_float* z, lapack_int* ldz, - lapack_int* isuppz, float* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_zstegr( char* jobz, char* range, lapack_int* n, double* d, - double* e, double* vl, double* vu, lapack_int* il, - lapack_int* iu, double* abstol, lapack_int* m, double* w, - lapack_complex_double* z, lapack_int* ldz, - lapack_int* isuppz, double* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_spteqr( char* compz, lapack_int* n, float* d, float* e, float* z, - lapack_int* ldz, float* work, lapack_int *info ); -void LAPACK_dpteqr( char* compz, lapack_int* n, double* d, double* e, double* z, - lapack_int* ldz, double* work, lapack_int *info ); -void LAPACK_cpteqr( char* compz, lapack_int* n, float* d, float* e, - lapack_complex_float* z, lapack_int* ldz, float* work, - lapack_int *info ); -void LAPACK_zpteqr( char* compz, lapack_int* n, double* d, double* e, - lapack_complex_double* z, lapack_int* ldz, double* work, - lapack_int *info ); -void LAPACK_sstebz( char* range, char* order, lapack_int* n, float* vl, - float* vu, lapack_int* il, lapack_int* iu, float* abstol, - const float* d, const float* e, lapack_int* m, - lapack_int* nsplit, float* w, lapack_int* iblock, - lapack_int* isplit, float* work, lapack_int* iwork, - lapack_int *info ); -void LAPACK_dstebz( char* range, char* order, lapack_int* n, double* vl, - double* vu, lapack_int* il, lapack_int* iu, double* abstol, - const double* d, const double* e, lapack_int* m, - lapack_int* nsplit, double* w, lapack_int* iblock, - lapack_int* isplit, double* work, lapack_int* iwork, - lapack_int *info ); -void LAPACK_sstein( lapack_int* n, const float* d, const float* e, - lapack_int* m, const float* w, const lapack_int* iblock, - const lapack_int* isplit, float* z, lapack_int* ldz, - float* work, lapack_int* iwork, lapack_int* ifailv, - lapack_int *info ); -void LAPACK_dstein( lapack_int* n, const double* d, const double* e, - lapack_int* m, const double* w, const lapack_int* iblock, - const lapack_int* isplit, double* z, lapack_int* ldz, - double* work, lapack_int* iwork, lapack_int* ifailv, - lapack_int *info ); -void LAPACK_cstein( lapack_int* n, const float* d, const float* e, - lapack_int* m, const float* w, const lapack_int* iblock, - const lapack_int* isplit, lapack_complex_float* z, - lapack_int* ldz, float* work, lapack_int* iwork, - lapack_int* ifailv, lapack_int *info ); -void LAPACK_zstein( lapack_int* n, const double* d, const double* e, - lapack_int* m, const double* w, const lapack_int* iblock, - const 
lapack_int* isplit, lapack_complex_double* z, - lapack_int* ldz, double* work, lapack_int* iwork, - lapack_int* ifailv, lapack_int *info ); -void LAPACK_sdisna( char* job, lapack_int* m, lapack_int* n, const float* d, - float* sep, lapack_int *info ); -void LAPACK_ddisna( char* job, lapack_int* m, lapack_int* n, const double* d, - double* sep, lapack_int *info ); -void LAPACK_ssygst( lapack_int* itype, char* uplo, lapack_int* n, float* a, - lapack_int* lda, const float* b, lapack_int* ldb, - lapack_int *info ); -void LAPACK_dsygst( lapack_int* itype, char* uplo, lapack_int* n, double* a, - lapack_int* lda, const double* b, lapack_int* ldb, - lapack_int *info ); -void LAPACK_chegst( lapack_int* itype, char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - const lapack_complex_float* b, lapack_int* ldb, - lapack_int *info ); -void LAPACK_zhegst( lapack_int* itype, char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* b, lapack_int* ldb, - lapack_int *info ); -void LAPACK_sspgst( lapack_int* itype, char* uplo, lapack_int* n, float* ap, - const float* bp, lapack_int *info ); -void LAPACK_dspgst( lapack_int* itype, char* uplo, lapack_int* n, double* ap, - const double* bp, lapack_int *info ); -void LAPACK_chpgst( lapack_int* itype, char* uplo, lapack_int* n, - lapack_complex_float* ap, const lapack_complex_float* bp, - lapack_int *info ); -void LAPACK_zhpgst( lapack_int* itype, char* uplo, lapack_int* n, - lapack_complex_double* ap, const lapack_complex_double* bp, - lapack_int *info ); -void LAPACK_ssbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka, - lapack_int* kb, float* ab, lapack_int* ldab, - const float* bb, lapack_int* ldbb, float* x, - lapack_int* ldx, float* work, lapack_int *info ); -void LAPACK_dsbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka, - lapack_int* kb, double* ab, lapack_int* ldab, - const double* bb, lapack_int* ldbb, double* x, - lapack_int* ldx, double* work, lapack_int *info ); -void LAPACK_chbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka, - lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab, - const lapack_complex_float* bb, lapack_int* ldbb, - lapack_complex_float* x, lapack_int* ldx, - lapack_complex_float* work, float* rwork, - lapack_int *info ); -void LAPACK_zhbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka, - lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab, - const lapack_complex_double* bb, lapack_int* ldbb, - lapack_complex_double* x, lapack_int* ldx, - lapack_complex_double* work, double* rwork, - lapack_int *info ); -void LAPACK_spbstf( char* uplo, lapack_int* n, lapack_int* kb, float* bb, - lapack_int* ldbb, lapack_int *info ); -void LAPACK_dpbstf( char* uplo, lapack_int* n, lapack_int* kb, double* bb, - lapack_int* ldbb, lapack_int *info ); -void LAPACK_cpbstf( char* uplo, lapack_int* n, lapack_int* kb, - lapack_complex_float* bb, lapack_int* ldbb, - lapack_int *info ); -void LAPACK_zpbstf( char* uplo, lapack_int* n, lapack_int* kb, - lapack_complex_double* bb, lapack_int* ldbb, - lapack_int *info ); -void LAPACK_sgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a, - lapack_int* lda, float* tau, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a, - lapack_int* lda, double* tau, double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_cgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, - lapack_complex_float* a, 
lapack_int* lda, - lapack_complex_float* tau, lapack_complex_float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_zgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* tau, lapack_complex_double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_sorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a, - lapack_int* lda, const float* tau, float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_dorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a, - lapack_int* lda, const double* tau, double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_sormhr( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, const float* a, - lapack_int* lda, const float* tau, float* c, - lapack_int* ldc, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dormhr( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, const double* a, - lapack_int* lda, const double* tau, double* c, - lapack_int* ldc, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, - lapack_complex_float* a, lapack_int* lda, - const lapack_complex_float* tau, lapack_complex_float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_zunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, - lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* tau, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cunmhr( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, - const lapack_complex_float* a, lapack_int* lda, - const lapack_complex_float* tau, lapack_complex_float* c, - lapack_int* ldc, lapack_complex_float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_zunmhr( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, - const lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* tau, lapack_complex_double* c, - lapack_int* ldc, lapack_complex_double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_sgebal( char* job, lapack_int* n, float* a, lapack_int* lda, - lapack_int* ilo, lapack_int* ihi, float* scale, - lapack_int *info ); -void LAPACK_dgebal( char* job, lapack_int* n, double* a, lapack_int* lda, - lapack_int* ilo, lapack_int* ihi, double* scale, - lapack_int *info ); -void LAPACK_cgebal( char* job, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int* ilo, lapack_int* ihi, - float* scale, lapack_int *info ); -void LAPACK_zgebal( char* job, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int* ilo, lapack_int* ihi, - double* scale, lapack_int *info ); -void LAPACK_sgebak( char* job, char* side, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, const float* scale, lapack_int* m, - float* v, lapack_int* ldv, lapack_int *info ); -void LAPACK_dgebak( char* job, char* side, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, const double* scale, lapack_int* m, - double* v, lapack_int* ldv, lapack_int *info ); -void LAPACK_cgebak( char* job, char* side, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, const float* scale, lapack_int* m, - lapack_complex_float* v, lapack_int* ldv, - lapack_int *info ); -void LAPACK_zgebak( char* job, char* side, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, const double* scale, lapack_int* m, - 
lapack_complex_double* v, lapack_int* ldv, - lapack_int *info ); -void LAPACK_shseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, float* h, lapack_int* ldh, float* wr, - float* wi, float* z, lapack_int* ldz, float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_dhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, double* h, lapack_int* ldh, double* wr, - double* wi, double* z, lapack_int* ldz, double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_chseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, lapack_complex_float* h, lapack_int* ldh, - lapack_complex_float* w, lapack_complex_float* z, - lapack_int* ldz, lapack_complex_float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_zhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, lapack_complex_double* h, lapack_int* ldh, - lapack_complex_double* w, lapack_complex_double* z, - lapack_int* ldz, lapack_complex_double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_shsein( char* job, char* eigsrc, char* initv, - lapack_logical* select, lapack_int* n, const float* h, - lapack_int* ldh, float* wr, const float* wi, float* vl, - lapack_int* ldvl, float* vr, lapack_int* ldvr, - lapack_int* mm, lapack_int* m, float* work, - lapack_int* ifaill, lapack_int* ifailr, lapack_int *info ); -void LAPACK_dhsein( char* job, char* eigsrc, char* initv, - lapack_logical* select, lapack_int* n, const double* h, - lapack_int* ldh, double* wr, const double* wi, double* vl, - lapack_int* ldvl, double* vr, lapack_int* ldvr, - lapack_int* mm, lapack_int* m, double* work, - lapack_int* ifaill, lapack_int* ifailr, lapack_int *info ); -void LAPACK_chsein( char* job, char* eigsrc, char* initv, - const lapack_logical* select, lapack_int* n, - const lapack_complex_float* h, lapack_int* ldh, - lapack_complex_float* w, lapack_complex_float* vl, - lapack_int* ldvl, lapack_complex_float* vr, - lapack_int* ldvr, lapack_int* mm, lapack_int* m, - lapack_complex_float* work, float* rwork, - lapack_int* ifaill, lapack_int* ifailr, lapack_int *info ); -void LAPACK_zhsein( char* job, char* eigsrc, char* initv, - const lapack_logical* select, lapack_int* n, - const lapack_complex_double* h, lapack_int* ldh, - lapack_complex_double* w, lapack_complex_double* vl, - lapack_int* ldvl, lapack_complex_double* vr, - lapack_int* ldvr, lapack_int* mm, lapack_int* m, - lapack_complex_double* work, double* rwork, - lapack_int* ifaill, lapack_int* ifailr, lapack_int *info ); -void LAPACK_strevc( char* side, char* howmny, lapack_logical* select, - lapack_int* n, const float* t, lapack_int* ldt, float* vl, - lapack_int* ldvl, float* vr, lapack_int* ldvr, - lapack_int* mm, lapack_int* m, float* work, - lapack_int *info ); -void LAPACK_dtrevc( char* side, char* howmny, lapack_logical* select, - lapack_int* n, const double* t, lapack_int* ldt, double* vl, - lapack_int* ldvl, double* vr, lapack_int* ldvr, - lapack_int* mm, lapack_int* m, double* work, - lapack_int *info ); -void LAPACK_ctrevc( char* side, char* howmny, const lapack_logical* select, - lapack_int* n, lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* vl, lapack_int* ldvl, - lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm, - lapack_int* m, lapack_complex_float* work, float* rwork, - lapack_int *info ); -void LAPACK_ztrevc( char* side, char* howmny, const lapack_logical* select, - lapack_int* n, lapack_complex_double* t, lapack_int* ldt, - 
lapack_complex_double* vl, lapack_int* ldvl, - lapack_complex_double* vr, lapack_int* ldvr, lapack_int* mm, - lapack_int* m, lapack_complex_double* work, double* rwork, - lapack_int *info ); -void LAPACK_strsna( char* job, char* howmny, const lapack_logical* select, - lapack_int* n, const float* t, lapack_int* ldt, - const float* vl, lapack_int* ldvl, const float* vr, - lapack_int* ldvr, float* s, float* sep, lapack_int* mm, - lapack_int* m, float* work, lapack_int* ldwork, - lapack_int* iwork, lapack_int *info ); -void LAPACK_dtrsna( char* job, char* howmny, const lapack_logical* select, - lapack_int* n, const double* t, lapack_int* ldt, - const double* vl, lapack_int* ldvl, const double* vr, - lapack_int* ldvr, double* s, double* sep, lapack_int* mm, - lapack_int* m, double* work, lapack_int* ldwork, - lapack_int* iwork, lapack_int *info ); -void LAPACK_ctrsna( char* job, char* howmny, const lapack_logical* select, - lapack_int* n, const lapack_complex_float* t, - lapack_int* ldt, const lapack_complex_float* vl, - lapack_int* ldvl, const lapack_complex_float* vr, - lapack_int* ldvr, float* s, float* sep, lapack_int* mm, - lapack_int* m, lapack_complex_float* work, - lapack_int* ldwork, float* rwork, lapack_int *info ); -void LAPACK_ztrsna( char* job, char* howmny, const lapack_logical* select, - lapack_int* n, const lapack_complex_double* t, - lapack_int* ldt, const lapack_complex_double* vl, - lapack_int* ldvl, const lapack_complex_double* vr, - lapack_int* ldvr, double* s, double* sep, lapack_int* mm, - lapack_int* m, lapack_complex_double* work, - lapack_int* ldwork, double* rwork, lapack_int *info ); -void LAPACK_strexc( char* compq, lapack_int* n, float* t, lapack_int* ldt, - float* q, lapack_int* ldq, lapack_int* ifst, - lapack_int* ilst, float* work, lapack_int *info ); -void LAPACK_dtrexc( char* compq, lapack_int* n, double* t, lapack_int* ldt, - double* q, lapack_int* ldq, lapack_int* ifst, - lapack_int* ilst, double* work, lapack_int *info ); -void LAPACK_ctrexc( char* compq, lapack_int* n, lapack_complex_float* t, - lapack_int* ldt, lapack_complex_float* q, lapack_int* ldq, - lapack_int* ifst, lapack_int* ilst, lapack_int *info ); -void LAPACK_ztrexc( char* compq, lapack_int* n, lapack_complex_double* t, - lapack_int* ldt, lapack_complex_double* q, lapack_int* ldq, - lapack_int* ifst, lapack_int* ilst, lapack_int *info ); -void LAPACK_strsen( char* job, char* compq, const lapack_logical* select, - lapack_int* n, float* t, lapack_int* ldt, float* q, - lapack_int* ldq, float* wr, float* wi, lapack_int* m, - float* s, float* sep, float* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_dtrsen( char* job, char* compq, const lapack_logical* select, - lapack_int* n, double* t, lapack_int* ldt, double* q, - lapack_int* ldq, double* wr, double* wi, lapack_int* m, - double* s, double* sep, double* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_ctrsen( char* job, char* compq, const lapack_logical* select, - lapack_int* n, lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* q, lapack_int* ldq, - lapack_complex_float* w, lapack_int* m, float* s, - float* sep, lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_ztrsen( char* job, char* compq, const lapack_logical* select, - lapack_int* n, lapack_complex_double* t, lapack_int* ldt, - lapack_complex_double* q, lapack_int* ldq, - lapack_complex_double* w, lapack_int* m, double* s, - double* sep, 
lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_strsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m, - lapack_int* n, const float* a, lapack_int* lda, - const float* b, lapack_int* ldb, float* c, lapack_int* ldc, - float* scale, lapack_int *info ); -void LAPACK_dtrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m, - lapack_int* n, const double* a, lapack_int* lda, - const double* b, lapack_int* ldb, double* c, - lapack_int* ldc, double* scale, lapack_int *info ); -void LAPACK_ctrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m, - lapack_int* n, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* b, - lapack_int* ldb, lapack_complex_float* c, lapack_int* ldc, - float* scale, lapack_int *info ); -void LAPACK_ztrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m, - lapack_int* n, const lapack_complex_double* a, - lapack_int* lda, const lapack_complex_double* b, - lapack_int* ldb, lapack_complex_double* c, lapack_int* ldc, - double* scale, lapack_int *info ); -void LAPACK_sgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, float* a, lapack_int* lda, float* b, - lapack_int* ldb, float* q, lapack_int* ldq, float* z, - lapack_int* ldz, lapack_int *info ); -void LAPACK_dgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, double* a, lapack_int* lda, double* b, - lapack_int* ldb, double* q, lapack_int* ldq, double* z, - lapack_int* ldz, lapack_int *info ); -void LAPACK_cgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* q, lapack_int* ldq, - lapack_complex_float* z, lapack_int* ldz, - lapack_int *info ); -void LAPACK_zgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* q, lapack_int* ldq, - lapack_complex_double* z, lapack_int* ldz, - lapack_int *info ); -void LAPACK_sgghd3( char* compq, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, float* a, lapack_int* lda, float* b, - lapack_int* ldb, float* q, lapack_int* ldq, float* z, - lapack_int* ldz, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dgghd3( char* compq, char* compz, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, double* a, lapack_int* lda, double* b, - lapack_int* ldb, double* q, lapack_int* ldq, double* z, - lapack_int* ldz, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cgghd3( char* compq, char* compz, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* q, lapack_int* ldq, - lapack_complex_float* z, lapack_int* ldz, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zgghd3( char* compq, char* compz, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* q, lapack_int* ldq, - lapack_complex_double* z, lapack_int* ldz, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_sggbal( char* job, lapack_int* n, float* a, lapack_int* lda, - float* b, lapack_int* ldb, lapack_int* ilo, lapack_int* ihi, - float* lscale, float* rscale, float* work, - 
lapack_int *info ); -void LAPACK_dggbal( char* job, lapack_int* n, double* a, lapack_int* lda, - double* b, lapack_int* ldb, lapack_int* ilo, - lapack_int* ihi, double* lscale, double* rscale, - double* work, lapack_int *info ); -void LAPACK_cggbal( char* job, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, - lapack_int* ilo, lapack_int* ihi, float* lscale, - float* rscale, float* work, lapack_int *info ); -void LAPACK_zggbal( char* job, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, - lapack_int* ilo, lapack_int* ihi, double* lscale, - double* rscale, double* work, lapack_int *info ); -void LAPACK_sggbak( char* job, char* side, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, const float* lscale, const float* rscale, - lapack_int* m, float* v, lapack_int* ldv, - lapack_int *info ); -void LAPACK_dggbak( char* job, char* side, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, const double* lscale, const double* rscale, - lapack_int* m, double* v, lapack_int* ldv, - lapack_int *info ); -void LAPACK_cggbak( char* job, char* side, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, const float* lscale, const float* rscale, - lapack_int* m, lapack_complex_float* v, lapack_int* ldv, - lapack_int *info ); -void LAPACK_zggbak( char* job, char* side, lapack_int* n, lapack_int* ilo, - lapack_int* ihi, const double* lscale, const double* rscale, - lapack_int* m, lapack_complex_double* v, lapack_int* ldv, - lapack_int *info ); -void LAPACK_shgeqz( char* job, char* compq, char* compz, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, float* h, lapack_int* ldh, - float* t, lapack_int* ldt, float* alphar, float* alphai, - float* beta, float* q, lapack_int* ldq, float* z, - lapack_int* ldz, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dhgeqz( char* job, char* compq, char* compz, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, double* h, - lapack_int* ldh, double* t, lapack_int* ldt, double* alphar, - double* alphai, double* beta, double* q, lapack_int* ldq, - double* z, lapack_int* ldz, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chgeqz( char* job, char* compq, char* compz, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, lapack_complex_float* h, - lapack_int* ldh, lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* alpha, lapack_complex_float* beta, - lapack_complex_float* q, lapack_int* ldq, - lapack_complex_float* z, lapack_int* ldz, - lapack_complex_float* work, lapack_int* lwork, float* rwork, - lapack_int *info ); -void LAPACK_zhgeqz( char* job, char* compq, char* compz, lapack_int* n, - lapack_int* ilo, lapack_int* ihi, lapack_complex_double* h, - lapack_int* ldh, lapack_complex_double* t, lapack_int* ldt, - lapack_complex_double* alpha, lapack_complex_double* beta, - lapack_complex_double* q, lapack_int* ldq, - lapack_complex_double* z, lapack_int* ldz, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, lapack_int *info ); -void LAPACK_stgevc( char* side, char* howmny, const lapack_logical* select, - lapack_int* n, const float* s, lapack_int* lds, - const float* p, lapack_int* ldp, float* vl, - lapack_int* ldvl, float* vr, lapack_int* ldvr, - lapack_int* mm, lapack_int* m, float* work, - lapack_int *info ); -void LAPACK_dtgevc( char* side, char* howmny, const lapack_logical* select, - lapack_int* n, const double* s, lapack_int* lds, - const double* p, lapack_int* ldp, double* vl, - lapack_int* ldvl, double* 
vr, lapack_int* ldvr, - lapack_int* mm, lapack_int* m, double* work, - lapack_int *info ); -void LAPACK_ctgevc( char* side, char* howmny, const lapack_logical* select, - lapack_int* n, const lapack_complex_float* s, - lapack_int* lds, const lapack_complex_float* p, - lapack_int* ldp, lapack_complex_float* vl, lapack_int* ldvl, - lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm, - lapack_int* m, lapack_complex_float* work, float* rwork, - lapack_int *info ); -void LAPACK_ztgevc( char* side, char* howmny, const lapack_logical* select, - lapack_int* n, const lapack_complex_double* s, - lapack_int* lds, const lapack_complex_double* p, - lapack_int* ldp, lapack_complex_double* vl, - lapack_int* ldvl, lapack_complex_double* vr, - lapack_int* ldvr, lapack_int* mm, lapack_int* m, - lapack_complex_double* work, double* rwork, - lapack_int *info ); -void LAPACK_stgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, - float* a, lapack_int* lda, float* b, lapack_int* ldb, - float* q, lapack_int* ldq, float* z, lapack_int* ldz, - lapack_int* ifst, lapack_int* ilst, float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_dtgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, - double* a, lapack_int* lda, double* b, lapack_int* ldb, - double* q, lapack_int* ldq, double* z, lapack_int* ldz, - lapack_int* ifst, lapack_int* ilst, double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_ctgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* q, lapack_int* ldq, - lapack_complex_float* z, lapack_int* ldz, lapack_int* ifst, - lapack_int* ilst, lapack_int *info ); -void LAPACK_ztgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* q, lapack_int* ldq, - lapack_complex_double* z, lapack_int* ldz, lapack_int* ifst, - lapack_int* ilst, lapack_int *info ); -void LAPACK_stgsen( lapack_int* ijob, lapack_logical* wantq, - lapack_logical* wantz, const lapack_logical* select, - lapack_int* n, float* a, lapack_int* lda, float* b, - lapack_int* ldb, float* alphar, float* alphai, float* beta, - float* q, lapack_int* ldq, float* z, lapack_int* ldz, - lapack_int* m, float* pl, float* pr, float* dif, - float* work, lapack_int* lwork, lapack_int* iwork, - lapack_int* liwork, lapack_int *info ); -void LAPACK_dtgsen( lapack_int* ijob, lapack_logical* wantq, - lapack_logical* wantz, const lapack_logical* select, - lapack_int* n, double* a, lapack_int* lda, double* b, - lapack_int* ldb, double* alphar, double* alphai, - double* beta, double* q, lapack_int* ldq, double* z, - lapack_int* ldz, lapack_int* m, double* pl, double* pr, - double* dif, double* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_ctgsen( lapack_int* ijob, lapack_logical* wantq, - lapack_logical* wantz, const lapack_logical* select, - lapack_int* n, lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* alpha, lapack_complex_float* beta, - lapack_complex_float* q, lapack_int* ldq, - lapack_complex_float* z, lapack_int* ldz, lapack_int* m, - float* pl, float* pr, float* dif, - lapack_complex_float* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_ztgsen( lapack_int* ijob, lapack_logical* wantq, - 
-                   lapack_logical* wantz, const lapack_logical* select,
-                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* q, lapack_int* ldq,
-                   lapack_complex_double* z, lapack_int* ldz, lapack_int* m,
-                   double* pl, double* pr, double* dif,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_stgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
-                   const float* a, lapack_int* lda, const float* b,
-                   lapack_int* ldb, float* c, lapack_int* ldc, const float* d,
-                   lapack_int* ldd, const float* e, lapack_int* lde, float* f,
-                   lapack_int* ldf, float* scale, float* dif, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_dtgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
-                   const double* a, lapack_int* lda, const double* b,
-                   lapack_int* ldb, double* c, lapack_int* ldc,
-                   const double* d, lapack_int* ldd, const double* e,
-                   lapack_int* lde, double* f, lapack_int* ldf, double* scale,
-                   double* dif, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_ctgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
-                   const lapack_complex_float* a, lapack_int* lda,
-                   const lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* c, lapack_int* ldc,
-                   const lapack_complex_float* d, lapack_int* ldd,
-                   const lapack_complex_float* e, lapack_int* lde,
-                   lapack_complex_float* f, lapack_int* ldf, float* scale,
-                   float* dif, lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_ztgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
-                   const lapack_complex_double* a, lapack_int* lda,
-                   const lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* c, lapack_int* ldc,
-                   const lapack_complex_double* d, lapack_int* ldd,
-                   const lapack_complex_double* e, lapack_int* lde,
-                   lapack_complex_double* f, lapack_int* ldf, double* scale,
-                   double* dif, lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_stgsna( char* job, char* howmny, const lapack_logical* select,
-                   lapack_int* n, const float* a, lapack_int* lda,
-                   const float* b, lapack_int* ldb, const float* vl,
-                   lapack_int* ldvl, const float* vr, lapack_int* ldvr,
-                   float* s, float* dif, lapack_int* mm, lapack_int* m,
-                   float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int *info );
-void LAPACK_dtgsna( char* job, char* howmny, const lapack_logical* select,
-                   lapack_int* n, const double* a, lapack_int* lda,
-                   const double* b, lapack_int* ldb, const double* vl,
-                   lapack_int* ldvl, const double* vr, lapack_int* ldvr,
-                   double* s, double* dif, lapack_int* mm, lapack_int* m,
-                   double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int *info );
-void LAPACK_ctgsna( char* job, char* howmny, const lapack_logical* select,
-                   lapack_int* n, const lapack_complex_float* a,
-                   lapack_int* lda, const lapack_complex_float* b,
-                   lapack_int* ldb, const lapack_complex_float* vl,
-                   lapack_int* ldvl, const lapack_complex_float* vr,
-                   lapack_int* ldvr, float* s, float* dif, lapack_int* mm,
-                   lapack_int* m, lapack_complex_float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_ztgsna( char* job, char* howmny, const lapack_logical* select,
-                   lapack_int* n, const lapack_complex_double* a,
-                   lapack_int* lda, const lapack_complex_double* b,
-                   lapack_int* ldb, const lapack_complex_double* vl,
-                   lapack_int* ldvl, const lapack_complex_double* vr,
-                   lapack_int* ldvr, double* s, double* dif, lapack_int* mm,
-                   lapack_int* m, lapack_complex_double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_sggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, float* a, lapack_int* lda,
-                   float* b, lapack_int* ldb, float* tola, float* tolb,
-                   lapack_int* k, lapack_int* l, float* u, lapack_int* ldu,
-                   float* v, lapack_int* ldv, float* q, lapack_int* ldq,
-                   lapack_int* iwork, float* tau, float* work,
-                   lapack_int *info );
-void LAPACK_dggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, double* a, lapack_int* lda,
-                   double* b, lapack_int* ldb, double* tola, double* tolb,
-                   lapack_int* k, lapack_int* l, double* u, lapack_int* ldu,
-                   double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                   lapack_int* iwork, double* tau, double* work,
-                   lapack_int *info );
-void LAPACK_cggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                   float* tola, float* tolb, lapack_int* k, lapack_int* l,
-                   lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* v, lapack_int* ldv,
-                   lapack_complex_float* q, lapack_int* ldq, lapack_int* iwork,
-                   float* rwork, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int *info );
-void LAPACK_zggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                   double* tola, double* tolb, lapack_int* k, lapack_int* l,
-                   lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* v, lapack_int* ldv,
-                   lapack_complex_double* q, lapack_int* ldq,
-                   lapack_int* iwork, double* rwork,
-                   lapack_complex_double* tau, lapack_complex_double* work,
-                   lapack_int *info );
-void LAPACK_sggsvp3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, float* a, lapack_int* lda,
-                   float* b, lapack_int* ldb, float* tola, float* tolb,
-                   lapack_int* k, lapack_int* l, float* u, lapack_int* ldu,
-                   float* v, lapack_int* ldv, float* q, lapack_int* ldq,
-                   lapack_int* iwork, float* tau, float* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_dggsvp3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, double* a, lapack_int* lda,
-                   double* b, lapack_int* ldb, double* tola, double* tolb,
-                   lapack_int* k, lapack_int* l, double* u, lapack_int* ldu,
-                   double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                   lapack_int* iwork, double* tau, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cggsvp3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, lapack_complex_float* a,
-                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                   float* tola, float* tolb, lapack_int* k, lapack_int* l,
-                   lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* v, lapack_int* ldv,
-                   lapack_complex_float* q, lapack_int* ldq, lapack_int* iwork,
-                   float* rwork, lapack_complex_float* tau,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zggsvp3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, lapack_complex_double* a,
-                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                   double* tola, double* tolb, lapack_int* k, lapack_int* l,
-                   lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* v, lapack_int* ldv,
-                   lapack_complex_double* q, lapack_int* ldq,
-                   lapack_int* iwork, double* rwork,
-                   lapack_complex_double* tau, lapack_complex_double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_stgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* tola, float* tolb, float* alpha, float* beta,
-                   float* u, lapack_int* ldu, float* v, lapack_int* ldv,
-                   float* q, lapack_int* ldq, float* work, lapack_int* ncycle,
-                   lapack_int *info );
-void LAPACK_dtgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* tola, double* tolb, double* alpha, double* beta,
-                   double* u, lapack_int* ldu, double* v, lapack_int* ldv,
-                   double* q, lapack_int* ldq, double* work,
-                   lapack_int* ncycle, lapack_int *info );
-void LAPACK_ctgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* tola,
-                   float* tolb, float* alpha, float* beta,
-                   lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* v, lapack_int* ldv,
-                   lapack_complex_float* q, lapack_int* ldq,
-                   lapack_complex_float* work, lapack_int* ncycle,
-                   lapack_int *info );
-void LAPACK_ztgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* tola,
-                   double* tolb, double* alpha, double* beta,
-                   lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* v, lapack_int* ldv,
-                   lapack_complex_double* q, lapack_int* ldq,
-                   lapack_complex_double* work, lapack_int* ncycle,
-                   lapack_int *info );
-void LAPACK_sgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_cgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_sgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb,
-                   lapack_int* jpvt, float* rcond, lapack_int* rank,
-                   float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb,
-                   lapack_int* jpvt, double* rcond, lapack_int* rank,
-                   double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_cgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* jpvt,
-                   float* rcond, lapack_int* rank, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int *info );
-void LAPACK_zgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* jpvt,
-                   double* rcond, lapack_int* rank,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_sgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, float* s,
-                   float* rcond, lapack_int* rank, float* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_dgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* s,
-                   double* rcond, lapack_int* rank, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* s,
-                   float* rcond, lapack_int* rank, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int *info );
-void LAPACK_zgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* s,
-                   double* rcond, lapack_int* rank,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_sgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, float* s,
-                   float* rcond, lapack_int* rank, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_dgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* s,
-                   double* rcond, lapack_int* rank, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_cgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* s,
-                   float* rcond, lapack_int* rank, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int* iwork,
-                   lapack_int *info );
-void LAPACK_zgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* s,
-                   double* rcond, lapack_int* rank,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* iwork, lapack_int *info );
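
A usage sketch for the least-squares drivers above (an illustrative example, assuming lapacke.h is on the include path and a LAPACK library is linked): these LAPACK_* declarations are the raw Fortran entry points, so every argument, scalars included, is passed by pointer, and workspace is sized with the standard two-call lwork = -1 query.

    #include <stdlib.h>
    #include "lapacke.h"

    /* Solve min ||b - A*x|| for a 3x2 column-major A; example data only. */
    int dgels_sketch(void)
    {
        char trans = 'N';
        lapack_int m = 3, n = 2, nrhs = 1, lda = 3, ldb = 3, info;
        double a[6] = { 1.0, 1.0, 1.0,  1.0, 2.0, 3.0 };
        double b[3] = { 6.0, 0.0, 0.0 };
        double wkopt;
        lapack_int lwork = -1;          /* first call: workspace query */

        LAPACK_dgels( &trans, &m, &n, &nrhs, a, &lda, b, &ldb,
                      &wkopt, &lwork, &info );
        if( info != 0 ) return (int)info;

        lwork = (lapack_int)wkopt;      /* optimal size returned in wkopt */
        double* work = (double*)malloc( sizeof(double) * lwork );
        if( !work ) return -1;
        LAPACK_dgels( &trans, &m, &n, &nrhs, a, &lda, b, &ldb,
                      work, &lwork, &info );
        free( work );
        return (int)info;               /* 0 on success; x is in b[0..n-1] */
    }
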
-void LAPACK_sgglse( lapack_int* m, lapack_int* n, lapack_int* p, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, float* c,
-                   float* d, float* x, float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_dgglse( lapack_int* m, lapack_int* n, lapack_int* p, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* c,
-                   double* d, double* x, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_cgglse( lapack_int* m, lapack_int* n, lapack_int* p,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* c, lapack_complex_float* d,
-                   lapack_complex_float* x, lapack_complex_float* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_zgglse( lapack_int* m, lapack_int* n, lapack_int* p,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* c, lapack_complex_double* d,
-                   lapack_complex_double* x, lapack_complex_double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_sggglm( lapack_int* n, lapack_int* m, lapack_int* p, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, float* d,
-                   float* x, float* y, float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_dggglm( lapack_int* n, lapack_int* m, lapack_int* p, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* d,
-                   double* x, double* y, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_cggglm( lapack_int* n, lapack_int* m, lapack_int* p,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* d, lapack_complex_float* x,
-                   lapack_complex_float* y, lapack_complex_float* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_zggglm( lapack_int* n, lapack_int* m, lapack_int* p,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* d, lapack_complex_double* x,
-                   lapack_complex_double* y, lapack_complex_double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_ssyev( char* jobz, char* uplo, lapack_int* n, float* a,
-                   lapack_int* lda, float* w, float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_dsyev( char* jobz, char* uplo, lapack_int* n, double* a,
-                   lapack_int* lda, double* w, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_cheev( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zheev( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* w,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_ssyevd( char* jobz, char* uplo, lapack_int* n, float* a,
-                   lapack_int* lda, float* w, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsyevd( char* jobz, char* uplo, lapack_int* n, double* a,
-                   lapack_int* lda, double* w, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_cheevd( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int *info );
-void LAPACK_zheevd( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* w,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_ssyevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   float* a, lapack_int* lda, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_dsyevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   double* a, lapack_int* lda, double* vl, double* vu,
-                   lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, double* z, lapack_int* ldz,
-                   double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_cheevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* vl,
-                   float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_zheevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* vl,
-                   double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, lapack_complex_double* z,
-                   lapack_int* ldz, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_ssyevr( char* jobz, char* range, char* uplo, lapack_int* n,
-                   float* a, lapack_int* lda, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, float* z, lapack_int* ldz,
-                   lapack_int* isuppz, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsyevr( char* jobz, char* range, char* uplo, lapack_int* n,
-                   double* a, lapack_int* lda, double* vl, double* vu,
-                   lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, double* z, lapack_int* ldz,
-                   lapack_int* isuppz, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_cheevr( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* vl,
-                   float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_int* isuppz,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int *info );
-void LAPACK_zheevr( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* vl,
-                   double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, lapack_complex_double* z,
-                   lapack_int* ldz, lapack_int* isuppz,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_sspev( char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
-                   float* z, lapack_int* ldz, float* work, lapack_int *info );
-void LAPACK_dspev( char* jobz, char* uplo, lapack_int* n, double* ap, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_chpev( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* ap, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work, float* rwork,
-                   lapack_int *info );
-void LAPACK_zhpev( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* ap, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int *info );
-void LAPACK_sspevd( char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
-                   float* z, lapack_int* ldz, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dspevd( char* jobz, char* uplo, lapack_int* n, double* ap,
-                   double* w, double* z, lapack_int* ldz, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int *info );
-void LAPACK_chpevd( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* ap, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int* lrwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_zhpevd( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* ap, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_sspevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   float* ap, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                   float* z, lapack_int* ldz, float* work, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_dspevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   double* ap, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_chpevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_complex_float* ap, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work, float* rwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_zhpevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_complex_double* ap, double* vl, double* vu,
-                   lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, lapack_complex_double* z,
-                   lapack_int* ldz, lapack_complex_double* work, double* rwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_ssbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   float* ab, lapack_int* ldab, float* w, float* z,
-                   lapack_int* ldz, float* work, lapack_int *info );
-void LAPACK_dsbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   double* ab, lapack_int* ldab, double* w, double* z,
-                   lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_chbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   lapack_complex_float* ab, lapack_int* ldab, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int *info );
-void LAPACK_zhbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   lapack_complex_double* ab, lapack_int* ldab, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int *info );
-void LAPACK_ssbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   float* ab, lapack_int* ldab, float* w, float* z,
-                   lapack_int* ldz, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   double* ab, lapack_int* ldab, double* w, double* z,
-                   lapack_int* ldz, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_chbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   lapack_complex_float* ab, lapack_int* ldab, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int *info );
-void LAPACK_zhbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   lapack_complex_double* ab, lapack_int* ldab, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
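
The symmetric and Hermitian eigensolvers above follow the same pointer convention; a minimal sketch for LAPACK_dsyev with a fixed workspace, using the documented minimum of max(1, 3*n-1) and arbitrary example data:

    #include "lapacke.h"

    /* Eigenvalues and eigenvectors of a 3x3 symmetric matrix. */
    int dsyev_sketch(void)
    {
        char jobz = 'V', uplo = 'U';
        lapack_int n = 3, lda = 3, info;
        double a[9] = { 2.0, 0.0, 0.0,
                        1.0, 2.0, 0.0,
                        0.0, 1.0, 2.0 };   /* column-major; upper triangle used */
        double w[3];
        double work[8];                    /* lwork >= 3*n - 1 = 8 */
        lapack_int lwork = 8;

        LAPACK_dsyev( &jobz, &uplo, &n, a, &lda, w, work, &lwork, &info );
        return (int)info;  /* on success, w holds eigenvalues in ascending order */
    }
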
-void LAPACK_ssbevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_int* kd, float* ab, lapack_int* ldab, float* q,
-                   lapack_int* ldq, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                   float* z, lapack_int* ldz, float* work,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_dsbevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_int* kd, double* ab, lapack_int* ldab, double* q,
-                   lapack_int* ldq, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   double* z, lapack_int* ldz, double* work,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_chbevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
-                   lapack_complex_float* q, lapack_int* ldq, float* vl,
-                   float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work,
-                   float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_zhbevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
-                   lapack_complex_double* q, lapack_int* ldq, double* vl,
-                   double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, lapack_complex_double* z,
-                   lapack_int* ldz, lapack_complex_double* work,
-                   double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_sstev( char* jobz, lapack_int* n, float* d, float* e, float* z,
-                   lapack_int* ldz, float* work, lapack_int *info );
-void LAPACK_dstev( char* jobz, lapack_int* n, double* d, double* e, double* z,
-                   lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_sstevd( char* jobz, lapack_int* n, float* d, float* e, float* z,
-                   lapack_int* ldz, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dstevd( char* jobz, lapack_int* n, double* d, double* e, double* z,
-                   lapack_int* ldz, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_sstevx( char* jobz, char* range, lapack_int* n, float* d, float* e,
-                   float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                   float* abstol, lapack_int* m, float* w, float* z,
-                   lapack_int* ldz, float* work, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_dstevx( char* jobz, char* range, lapack_int* n, double* d,
-                   double* e, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_sstevr( char* jobz, char* range, lapack_int* n, float* d, float* e,
-                   float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                   float* abstol, lapack_int* m, float* w, float* z,
-                   lapack_int* ldz, lapack_int* isuppz, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int *info );
-void LAPACK_dstevr( char* jobz, char* range, lapack_int* n, double* d,
-                   double* e, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   double* z, lapack_int* ldz, lapack_int* isuppz,
-                   double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_sgees( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
-                   lapack_int* n, float* a, lapack_int* lda, lapack_int* sdim,
-                   float* wr, float* wi, float* vs, lapack_int* ldvs,
-                   float* work, lapack_int* lwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_dgees( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
-                   lapack_int* n, double* a, lapack_int* lda, lapack_int* sdim,
-                   double* wr, double* wi, double* vs, lapack_int* ldvs,
-                   double* work, lapack_int* lwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_cgees( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
-                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_int* sdim, lapack_complex_float* w,
-                   lapack_complex_float* vs, lapack_int* ldvs,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_zgees( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
-                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_int* sdim, lapack_complex_double* w,
-                   lapack_complex_double* vs, lapack_int* ldvs,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_logical* bwork, lapack_int *info );
-void LAPACK_sgeesx( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
-                   char* sense, lapack_int* n, float* a, lapack_int* lda,
-                   lapack_int* sdim, float* wr, float* wi, float* vs,
-                   lapack_int* ldvs, float* rconde, float* rcondv, float* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_dgeesx( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
-                   char* sense, lapack_int* n, double* a, lapack_int* lda,
-                   lapack_int* sdim, double* wr, double* wi, double* vs,
-                   lapack_int* ldvs, double* rconde, double* rcondv,
-                   double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_cgeesx( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
-                   char* sense, lapack_int* n, lapack_complex_float* a,
-                   lapack_int* lda, lapack_int* sdim, lapack_complex_float* w,
-                   lapack_complex_float* vs, lapack_int* ldvs, float* rconde,
-                   float* rcondv, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_zgeesx( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
-                   char* sense, lapack_int* n, lapack_complex_double* a,
-                   lapack_int* lda, lapack_int* sdim, lapack_complex_double* w,
-                   lapack_complex_double* vs, lapack_int* ldvs, double* rconde,
-                   double* rcondv, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_logical* bwork,
-                   lapack_int *info );
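
The gees/geesx drivers above take a caller-supplied selection callback when sort = 'S'. In LAPACKE these callback types are function-pointer typedefs; LAPACK_D_SELECT2, for instance, receives pointers to the real and imaginary parts of a candidate eigenvalue. A hypothetical selector that keeps left-half-plane eigenvalues at the top of the Schur form:

    #include "lapacke.h"

    /* Selector for LAPACK_dgees with sort = 'S': an eigenvalue wr + wi*i is
       moved to the leading block when this returns nonzero. */
    static lapack_logical select_left_half_plane( const double* wr,
                                                  const double* wi )
    {
        (void)wi;               /* imaginary part not needed for this rule */
        return *wr < 0.0;
    }

Passed as the select argument of LAPACK_dgees together with sort = 'S', the number of selected eigenvalues comes back in sdim.
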
-void LAPACK_sgeev( char* jobvl, char* jobvr, lapack_int* n, float* a,
-                   lapack_int* lda, float* wr, float* wi, float* vl,
-                   lapack_int* ldvl, float* vr, lapack_int* ldvr, float* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_dgeev( char* jobvl, char* jobvr, lapack_int* n, double* a,
-                   lapack_int* lda, double* wr, double* wi, double* vl,
-                   lapack_int* ldvl, double* vr, lapack_int* ldvr, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cgeev( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* w, lapack_complex_float* vl,
-                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zgeev( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* w, lapack_complex_double* vl,
-                   lapack_int* ldvl, lapack_complex_double* vr,
-                   lapack_int* ldvr, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int *info );
-void LAPACK_sgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                   lapack_int* n, float* a, lapack_int* lda, float* wr,
-                   float* wi, float* vl, lapack_int* ldvl, float* vr,
-                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                   float* scale, float* abnrm, float* rconde, float* rcondv,
-                   float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int *info );
-void LAPACK_dgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                   lapack_int* n, double* a, lapack_int* lda, double* wr,
-                   double* wi, double* vl, lapack_int* ldvl, double* vr,
-                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                   double* scale, double* abnrm, double* rconde,
-                   double* rcondv, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_cgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* w, lapack_complex_float* vl,
-                   lapack_int* ldvl, lapack_complex_float* vr,
-                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                   float* scale, float* abnrm, float* rconde, float* rcondv,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* w, lapack_complex_double* vl,
-                   lapack_int* ldvl, lapack_complex_double* vr,
-                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                   double* scale, double* abnrm, double* rconde,
-                   double* rcondv, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int *info );
-void LAPACK_sgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
-                   float* a, lapack_int* lda, float* s, float* u,
-                   lapack_int* ldu, float* vt, lapack_int* ldvt, float* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_dgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
-                   double* a, lapack_int* lda, double* s, double* u,
-                   lapack_int* ldu, double* vt, lapack_int* ldvt, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* s,
-                   lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* vt, lapack_int* ldvt,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* s,
-                   lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* vt, lapack_int* ldvt,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_sgesvdx( char* jobu, char* jobvt, char* range, lapack_int* m, lapack_int* n,
-                   float* a, lapack_int* lda, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, lapack_int* ns, float* s, float* u,
-                   lapack_int* ldu, float* vt, lapack_int* ldvt, float* work,
-                   lapack_int* lwork, lapack_int *iwork, lapack_int *info );
-void LAPACK_dgesvdx( char* jobu, char* jobvt, char* range, lapack_int* m, lapack_int* n,
-                   double* a, lapack_int* lda, double* vl, double* vu,
-                   lapack_int* il, lapack_int* iu, lapack_int* ns, double* s, double* u,
-                   lapack_int* ldu, double* vt, lapack_int* ldvt, double* work,
-                   lapack_int* lwork, lapack_int *iwork, lapack_int *info );
-void LAPACK_cgesvdx( char* jobu, char* jobvt, char* range, lapack_int* m, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, lapack_int* ns, float* s,
-                   lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* vt, lapack_int* ldvt,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *iwork, lapack_int *info );
-void LAPACK_zgesvdx( char* jobu, char* jobvt, char* range, lapack_int* m, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* vl, double* vu,
-                   lapack_int* il, lapack_int* iu, lapack_int* ns, double* s,
-                   lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* vt, lapack_int* ldvt,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *iwork, lapack_int *info );
-void LAPACK_sgesdd( char* jobz, lapack_int* m, lapack_int* n, float* a,
-                   lapack_int* lda, float* s, float* u, lapack_int* ldu,
-                   float* vt, lapack_int* ldvt, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_dgesdd( char* jobz, lapack_int* m, lapack_int* n, double* a,
-                   lapack_int* lda, double* s, double* u, lapack_int* ldu,
-                   double* vt, lapack_int* ldvt, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_cgesdd( char* jobz, lapack_int* m, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* s,
-                   lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* vt, lapack_int* ldvt,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_zgesdd( char* jobz, lapack_int* m, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* s,
-                   lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* vt, lapack_int* ldvt,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_dgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
-                   char* jobp, lapack_int* m, lapack_int* n, double* a,
-                   lapack_int* lda, double* sva, double* u, lapack_int* ldu,
-                   double* v, lapack_int* ldv, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_sgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
-                   char* jobp, lapack_int* m, lapack_int* n, float* a,
-                   lapack_int* lda, float* sva, float* u, lapack_int* ldu,
-                   float* v, lapack_int* ldv, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_cgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
-                   char* jobp, lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                   lapack_int* lda, float* sva, lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* cwork,
-                   lapack_int* lwork, float* work, lapack_int* lrwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_zgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
-                   char* jobp, lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                   lapack_int* lda, double* sva, lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* cwork,
-                   lapack_int* lwork, double* work, lapack_int* lrwork,
-                   lapack_int* iwork, lapack_int *info );
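
Among the SVD drivers above, the divide-and-conquer variant gesdd additionally needs an integer workspace of 8*min(m,n). A minimal sketch computing singular values only with LAPACK_dgesdd (example data; with jobz = 'N' the u/vt arguments are not referenced, so one-element dummies suffice):

    #include <stdlib.h>
    #include "lapacke.h"

    /* Singular values of a 3x2 column-major matrix. */
    int dgesdd_sketch(void)
    {
        char jobz = 'N';
        lapack_int m = 3, n = 2, lda = 3, ldu = 1, ldvt = 1, info;
        double a[6] = { 1.0, 0.0, 1.0,  2.0, 1.0, 0.0 };
        double s[2], udummy[1], vtdummy[1];
        lapack_int iwork[16];                /* 8 * min(m,n) */
        double wkopt;
        lapack_int lwork = -1;               /* workspace query */

        LAPACK_dgesdd( &jobz, &m, &n, a, &lda, s, udummy, &ldu,
                       vtdummy, &ldvt, &wkopt, &lwork, iwork, &info );
        if( info != 0 ) return (int)info;

        lwork = (lapack_int)wkopt;
        double* work = (double*)malloc( sizeof(double) * lwork );
        if( !work ) return -1;
        LAPACK_dgesdd( &jobz, &m, &n, a, &lda, s, udummy, &ldu,
                       vtdummy, &ldvt, work, &lwork, iwork, &info );
        free( work );
        return (int)info;                    /* s[0] >= s[1] on success */
    }
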
-void LAPACK_dgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
-                   lapack_int* n, double* a, lapack_int* lda, double* sva,
-                   lapack_int* mv, double* v, lapack_int* ldv, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_sgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
-                   lapack_int* n, float* a, lapack_int* lda, float* sva,
-                   lapack_int* mv, float* v, lapack_int* ldv, float* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
-                   lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* sva,
-                   lapack_int* mv, lapack_complex_float* v, lapack_int* ldv,
-                   lapack_complex_float* cwork, lapack_int* lwork, float* rwork,
-                   lapack_int* lrwork, lapack_int *info );
-void LAPACK_zgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
-                   lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* sva,
-                   lapack_int* mv, lapack_complex_double* v, lapack_int* ldv,
-                   lapack_complex_double* cwork, lapack_int* lwork, double* rwork,
-                   lapack_int* lrwork, lapack_int *info );
-void LAPACK_sggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* alpha, float* beta, float* u, lapack_int* ldu,
-                   float* v, lapack_int* ldv, float* q, lapack_int* ldq,
-                   float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* alpha, double* beta, double* u, lapack_int* ldu,
-                   double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                   double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* alpha,
-                   float* beta, lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* v, lapack_int* ldv,
-                   lapack_complex_float* q, lapack_int* ldq,
-                   lapack_complex_float* work, float* rwork, lapack_int* iwork,
-                   lapack_int *info );
-void LAPACK_zggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* alpha,
-                   double* beta, lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* v, lapack_int* ldv,
-                   lapack_complex_double* q, lapack_int* ldq,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_sggsvd3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* alpha, float* beta, float* u, lapack_int* ldu,
-                   float* v, lapack_int* ldv, float* q, lapack_int* ldq,
-                   float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int *info );
-void LAPACK_dggsvd3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* alpha, double* beta, double* u, lapack_int* ldu,
-                   double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                   double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int *info );
-void LAPACK_cggsvd3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* alpha,
-                   float* beta, lapack_complex_float* u, lapack_int* ldu,
-                   lapack_complex_float* v, lapack_int* ldv,
-                   lapack_complex_float* q, lapack_int* ldq,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* iwork, lapack_int *info );
-void LAPACK_zggsvd3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                   lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* alpha,
-                   double* beta, lapack_complex_double* u, lapack_int* ldu,
-                   lapack_complex_double* v, lapack_int* ldv,
-                   lapack_complex_double* q, lapack_int* ldq,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_ssygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* w, float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* w, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_chegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zhegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* w,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_ssygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* w, float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_dsygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* w, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_chegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int *info );
-void LAPACK_zhegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* w,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_ssygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                   lapack_int* n, float* a, lapack_int* lda, float* b,
-                   lapack_int* ldb, float* vl, float* vu, lapack_int* il,
-                   lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                   float* z, lapack_int* ldz, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_dsygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                   lapack_int* n, double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_chegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* vl,
-                   float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_zhegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* vl,
-                   double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, lapack_complex_double* z,
-                   lapack_int* ldz, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_sspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   float* ap, float* bp, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int *info );
-void LAPACK_dspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   double* ap, double* bp, double* w, double* z,
-                   lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_chpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* ap, lapack_complex_float* bp, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int *info );
-void LAPACK_zhpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* ap, lapack_complex_double* bp,
-                   double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int *info );
-void LAPACK_sspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   float* ap, float* bp, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_dspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   double* ap, double* bp, double* w, double* z,
-                   lapack_int* ldz, double* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_chpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* ap, lapack_complex_float* bp,
-                   float* w, lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int *info );
-void LAPACK_zhpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* ap, lapack_complex_double* bp,
-                   double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_sspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                   lapack_int* n, float* ap, float* bp, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int* iwork, lapack_int* ifail,
-                   lapack_int *info );
-void LAPACK_dspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                   lapack_int* n, double* ap, double* bp, double* vl,
-                   double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, double* z, lapack_int* ldz,
-                   double* work, lapack_int* iwork, lapack_int* ifail,
-                   lapack_int *info );
-void LAPACK_chpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                   lapack_int* n, lapack_complex_float* ap,
-                   lapack_complex_float* bp, float* vl, float* vu,
-                   lapack_int* il, lapack_int* iu, float* abstol,
-                   lapack_int* m, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work, float* rwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_zhpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                   lapack_int* n, lapack_complex_double* ap,
-                   lapack_complex_double* bp, double* vl, double* vu,
-                   lapack_int* il, lapack_int* iu, double* abstol,
-                   lapack_int* m, double* w, lapack_complex_double* z,
-                   lapack_int* ldz, lapack_complex_double* work, double* rwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_ssbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
-                   lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int *info );
-void LAPACK_dsbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
-                   lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
-                   double* work, lapack_int *info );
-void LAPACK_chbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
-                   lapack_complex_float* bb, lapack_int* ldbb, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int *info );
-void LAPACK_zhbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
-                   lapack_complex_double* bb, lapack_int* ldbb, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int *info );
-void LAPACK_ssbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
-                   lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_dsbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
-                   lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
-                   double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_chbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
-                   lapack_complex_float* bb, lapack_int* ldbb, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                   lapack_int *info );
-void LAPACK_zhbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
-                   lapack_complex_double* bb, lapack_int* ldbb, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_int *info );
-void LAPACK_ssbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
-                   float* bb, lapack_int* ldbb, float* q, lapack_int* ldq,
-                   float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                   float* abstol, lapack_int* m, float* w, float* z,
-                   lapack_int* ldz, float* work, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_dsbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_int* ka, lapack_int* kb, double* ab,
-                   lapack_int* ldab, double* bb, lapack_int* ldbb, double* q,
-                   lapack_int* ldq, double* vl, double* vu, lapack_int* il,
-                   lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_chbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
-                   lapack_int* ldab, lapack_complex_float* bb,
-                   lapack_int* ldbb, lapack_complex_float* q, lapack_int* ldq,
-                   float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                   float* abstol, lapack_int* m, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int* iwork,
-                   lapack_int* ifail, lapack_int *info );
-void LAPACK_zhbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
-                   lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
-                   lapack_int* ldab, lapack_complex_double* bb,
-                   lapack_int* ldbb, lapack_complex_double* q, lapack_int* ldq,
-                   double* vl, double* vu, lapack_int* il, lapack_int* iu,
-                   double* abstol, lapack_int* m, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_sgges( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_S_SELECT3 selctg, lapack_int* n, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim,
-                   float* alphar, float* alphai, float* beta, float* vsl,
-                   lapack_int* ldvsl, float* vsr, lapack_int* ldvsr,
-                   float* work, lapack_int* lwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_dgges( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb,
-                   lapack_int* sdim, double* alphar, double* alphai,
-                   double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
-                   lapack_int* ldvsr, double* work, lapack_int* lwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_cgges( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_C_SELECT2 selctg, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
-                   lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* vsl, lapack_int* ldvsl,
-                   lapack_complex_float* vsr, lapack_int* ldvsr,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_zgges( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_Z_SELECT2 selctg, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vsl, lapack_int* ldvsl,
-                   lapack_complex_double* vsr, lapack_int* ldvsr,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_logical* bwork, lapack_int *info );
-void LAPACK_sgges3( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_S_SELECT3 selctg, lapack_int* n,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   lapack_int* sdim, float* alphar, float* alphai,
-                   float* beta, float* vsl, lapack_int* ldvsl,
-                   float* vsr, lapack_int* ldvsr,
-                   float* work, lapack_int* lwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_dgges3( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb,
-                   lapack_int* sdim, double* alphar, double* alphai,
-                   double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
-                   lapack_int* ldvsr, double* work, lapack_int* lwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_cgges3( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_C_SELECT2 selctg, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_int* sdim,
-                   lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* vsl, lapack_int* ldvsl,
-                   lapack_complex_float* vsr, lapack_int* ldvsr,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_zgges3( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_Z_SELECT2 selctg, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vsl, lapack_int* ldvsl,
-                   lapack_complex_double* vsr, lapack_int* ldvsr,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_logical* bwork, lapack_int *info );
-void LAPACK_sggesx( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_S_SELECT3 selctg, char* sense, lapack_int* n,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   lapack_int* sdim, float* alphar, float* alphai, float* beta,
-                   float* vsl, lapack_int* ldvsl, float* vsr,
-                   lapack_int* ldvsr, float* rconde, float* rcondv,
-                   float* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_dggesx( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_D_SELECT3 selctg, char* sense, lapack_int* n,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   lapack_int* sdim, double* alphar, double* alphai,
-                   double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
-                   lapack_int* ldvsr, double* rconde, double* rcondv,
-                   double* work, lapack_int* lwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_cggesx( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_C_SELECT2 selctg, char* sense, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
-                   lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* vsl, lapack_int* ldvsl,
-                   lapack_complex_float* vsr, lapack_int* ldvsr, float* rconde,
-                   float* rcondv, lapack_complex_float* work,
-                   lapack_int* lwork, float* rwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_zggesx( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_Z_SELECT2 selctg, char* sense, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vsl, lapack_int* ldvsl,
-                   lapack_complex_double* vsr, lapack_int* ldvsr,
-                   double* rconde, double* rcondv, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int* iwork,
-                   lapack_int* liwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_sggev( char* jobvl, char* jobvr, lapack_int* n, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, float* alphar,
-                   float* alphai, float* beta, float* vl, lapack_int* ldvl,
-                   float* vr, lapack_int* ldvr, float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_dggev( char* jobvl, char* jobvr, lapack_int* n, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
-                   double* alphai, double* beta, double* vl, lapack_int* ldvl,
-                   double* vr, lapack_int* ldvr, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cggev( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* vl, lapack_int* ldvl,
-                   lapack_complex_float* vr, lapack_int* ldvr,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zggev( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vl, lapack_int* ldvl,
-                   lapack_complex_double* vr, lapack_int* ldvr,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_sggev3( char* jobvl, char* jobvr, lapack_int* n, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, float* alphar,
-                   float* alphai, float* beta, float* vl, lapack_int* ldvl,
-                   float* vr, lapack_int* ldvr, float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_dggev3( char* jobvl, char* jobvr, lapack_int* n, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
-                   double* alphai, double* beta, double* vl, lapack_int* ldvl,
-                   double* vr, lapack_int* ldvr, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cggev3( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* vl, lapack_int* ldvl,
-                   lapack_complex_float* vr, lapack_int* ldvr,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zggev3( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vl, lapack_int* ldvl,
-                   lapack_complex_double* vr, lapack_int* ldvr,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_sggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                   lapack_int* n, float* a, lapack_int* lda, float* b,
-                   lapack_int* ldb, float* alphar, float* alphai, float* beta,
-                   float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr,
-                   lapack_int* ilo, lapack_int* ihi, float* lscale,
-                   float* rscale, float* abnrm, float* bbnrm, float* rconde,
-                   float* rcondv, float* work, lapack_int* lwork,
-                   lapack_int* iwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_dggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                   lapack_int* n, double* a, lapack_int* lda, double* b,
-                   lapack_int* ldb, double* alphar, double* alphai,
-                   double* beta, double* vl, lapack_int* ldvl, double* vr,
-                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                   double* lscale, double* rscale, double* abnrm,
-                   double* bbnrm, double* rconde, double* rcondv, double* work,
-                   lapack_int* lwork, lapack_int* iwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_cggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* vl, lapack_int* ldvl,
-                   lapack_complex_float* vr, lapack_int* ldvr, lapack_int* ilo,
-                   lapack_int* ihi, float* lscale, float* rscale, float* abnrm,
-                   float* bbnrm, float* rconde, float* rcondv,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int* iwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_zggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vl, lapack_int* ldvl,
-                   lapack_complex_double* vr, lapack_int* ldvr,
-                   lapack_int* ilo, lapack_int* ihi, double* lscale,
-                   double* rscale, double* abnrm, double* bbnrm,
-                   double* rconde, double* rcondv, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int* iwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_dsfrk( char* transr, char* uplo, char* trans, lapack_int* n,
-                   lapack_int* k, double* alpha, const double* a,
-                   lapack_int* lda, double* beta, double* c );
-void LAPACK_ssfrk( char* transr, char* uplo, char* trans, lapack_int* n,
-                   lapack_int* k, float* alpha, const float* a, lapack_int* lda,
-                   float* beta, float* c );
-void LAPACK_zhfrk( char* transr, char* uplo, char* trans, lapack_int* n,
-                   lapack_int* k, double* alpha, const lapack_complex_double* a,
-                   lapack_int* lda, double* beta, lapack_complex_double* c );
-void LAPACK_chfrk( char* transr, char* uplo, char* trans, lapack_int* n,
-                   lapack_int* k, float* alpha, const lapack_complex_float* a,
-                   lapack_int* lda, float* beta, lapack_complex_float* c );
-void LAPACK_dtfsm( char* transr, char* side, char* uplo, char* trans,
-                   char* diag, lapack_int* m, lapack_int* n, double* alpha,
-                   const double* a, double* b, lapack_int* ldb );
-void LAPACK_stfsm( char* transr, char* side, char* uplo, char* trans,
-                   char* diag, lapack_int* m, lapack_int* n, float* alpha,
-                   const float* a, float* b, lapack_int* ldb );
-void LAPACK_ztfsm( char* transr, char* side, char* uplo, char* trans,
-                   char* diag, lapack_int* m, lapack_int* n,
-                   lapack_complex_double* alpha, const lapack_complex_double* a,
-                   lapack_complex_double* b, lapack_int* ldb );
-void LAPACK_ctfsm( char* transr, char* side, char* uplo, char* trans,
-                   char* diag, lapack_int* m, lapack_int* n,
-                   lapack_complex_float* alpha, const lapack_complex_float* a,
-                   lapack_complex_float* b, lapack_int* ldb );
-void LAPACK_dtfttp( char* transr, char* uplo, lapack_int* n, const double* arf,
-                   double* ap, lapack_int *info );
-void LAPACK_stfttp( char* transr, char* uplo, lapack_int* n, const float* arf,
-                   float* ap, lapack_int *info );
-void LAPACK_ztfttp( char* transr, char* uplo, lapack_int* n,
-                   const lapack_complex_double* arf, lapack_complex_double* ap,
-                   lapack_int *info );
-void LAPACK_ctfttp( char* transr, char* uplo, lapack_int* n,
-                   const lapack_complex_float* arf, lapack_complex_float* ap,
-                   lapack_int *info );
-void LAPACK_dtfttr( char* transr, char* uplo, lapack_int* n, const double* arf,
-                   double* a, lapack_int* lda, lapack_int *info );
-void LAPACK_stfttr( char* transr, char* uplo, lapack_int* n, const float* arf,
-                   float* a, lapack_int* lda, lapack_int *info );
-void LAPACK_ztfttr( char* transr, char* uplo, lapack_int* n,
-                   const lapack_complex_double* arf, lapack_complex_double* a,
-                   lapack_int* lda, lapack_int *info );
-void LAPACK_ctfttr( char* transr, char* uplo, lapack_int* n,
-                   const lapack_complex_float* arf, lapack_complex_float* a,
-                   lapack_int* lda, lapack_int *info );
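
The *frk, *tfsm, and the storage-conversion routines in this group operate on Rectangular Full Packed (RFP) format, which holds a triangular matrix in n*(n+1)/2 contiguous elements while still allowing level-3 BLAS kernels. A minimal sketch (example data only) expanding an RFP lower triangle to conventional full storage with LAPACK_dtfttr:

    #include "lapacke.h"

    /* Expand a lower-triangular 3x3 matrix from RFP to full storage;
       the strict upper triangle of a is left unset. */
    int dtfttr_sketch(void)
    {
        char transr = 'N', uplo = 'L';
        lapack_int n = 3, lda = 3, info;
        double arf[6] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 };  /* n*(n+1)/2 values */
        double a[9];

        LAPACK_dtfttr( &transr, &uplo, &n, arf, a, &lda, &info );
        return (int)info;
    }
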
double* arf, lapack_int *info ); -void LAPACK_stpttf( char* transr, char* uplo, lapack_int* n, const float* ap, - float* arf, lapack_int *info ); -void LAPACK_ztpttf( char* transr, char* uplo, lapack_int* n, - const lapack_complex_double* ap, lapack_complex_double* arf, - lapack_int *info ); -void LAPACK_ctpttf( char* transr, char* uplo, lapack_int* n, - const lapack_complex_float* ap, lapack_complex_float* arf, - lapack_int *info ); -void LAPACK_dtpttr( char* uplo, lapack_int* n, const double* ap, double* a, - lapack_int* lda, lapack_int *info ); -void LAPACK_stpttr( char* uplo, lapack_int* n, const float* ap, float* a, - lapack_int* lda, lapack_int *info ); -void LAPACK_ztpttr( char* uplo, lapack_int* n, const lapack_complex_double* ap, - lapack_complex_double* a, lapack_int* lda, - lapack_int *info ); -void LAPACK_ctpttr( char* uplo, lapack_int* n, const lapack_complex_float* ap, - lapack_complex_float* a, lapack_int* lda, - lapack_int *info ); -void LAPACK_dtrttf( char* transr, char* uplo, lapack_int* n, const double* a, - lapack_int* lda, double* arf, lapack_int *info ); -void LAPACK_strttf( char* transr, char* uplo, lapack_int* n, const float* a, - lapack_int* lda, float* arf, lapack_int *info ); -void LAPACK_ztrttf( char* transr, char* uplo, lapack_int* n, - const lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* arf, lapack_int *info ); -void LAPACK_ctrttf( char* transr, char* uplo, lapack_int* n, - const lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* arf, lapack_int *info ); -void LAPACK_dtrttp( char* uplo, lapack_int* n, const double* a, lapack_int* lda, - double* ap, lapack_int *info ); -void LAPACK_strttp( char* uplo, lapack_int* n, const float* a, lapack_int* lda, - float* ap, lapack_int *info ); -void LAPACK_ztrttp( char* uplo, lapack_int* n, const lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* ap, - lapack_int *info ); -void LAPACK_ctrttp( char* uplo, lapack_int* n, const lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* ap, - lapack_int *info ); -void LAPACK_sgeqrfp( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - float* tau, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dgeqrfp( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, - double* tau, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* tau, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* tau, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_clacgv( lapack_int* n, lapack_complex_float* x, lapack_int* incx ); -void LAPACK_zlacgv( lapack_int* n, lapack_complex_double* x, lapack_int* incx ); -void LAPACK_slarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n, - float* x ); -void LAPACK_dlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n, - double* x ); -void LAPACK_clarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n, - lapack_complex_float* x ); -void LAPACK_zlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n, - lapack_complex_double* x ); -void LAPACK_sgeqr2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - float* tau, float* work, lapack_int *info ); -void LAPACK_dgeqr2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, - double* tau, double* work, lapack_int *info 
); -void LAPACK_cgeqr2( lapack_int* m, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* tau, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_zgeqr2( lapack_int* m, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* tau, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_slacn2( lapack_int* n, float* v, float* x, lapack_int* isgn, - float* est, lapack_int* kase, lapack_int* isave ); -void LAPACK_dlacn2( lapack_int* n, double* v, double* x, lapack_int* isgn, - double* est, lapack_int* kase, lapack_int* isave ); -void LAPACK_clacn2( lapack_int* n, lapack_complex_float* v, - lapack_complex_float* x, float* est, - lapack_int* kase, lapack_int* isave ); -void LAPACK_zlacn2( lapack_int* n, lapack_complex_double* v, - lapack_complex_double* x, double* est, - lapack_int* kase, lapack_int* isave ); -void LAPACK_slacpy( char* uplo, lapack_int* m, lapack_int* n, const float* a, - lapack_int* lda, float* b, lapack_int* ldb ); -void LAPACK_dlacpy( char* uplo, lapack_int* m, lapack_int* n, const double* a, - lapack_int* lda, double* b, lapack_int* ldb ); -void LAPACK_clacpy( char* uplo, lapack_int* m, lapack_int* n, - const lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb ); -void LAPACK_zlacpy( char* uplo, lapack_int* m, lapack_int* n, - const lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb ); - -void LAPACK_clacp2( char* uplo, lapack_int* m, lapack_int* n, const float* a, - lapack_int* lda, lapack_complex_float* b, lapack_int* ldb ); -void LAPACK_zlacp2( char* uplo, lapack_int* m, lapack_int* n, const double* a, - lapack_int* lda, lapack_complex_double* b, - lapack_int* ldb ); - -void LAPACK_sgetf2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - lapack_int* ipiv, lapack_int *info ); -void LAPACK_dgetf2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, - lapack_int* ipiv, lapack_int *info ); -void LAPACK_cgetf2( lapack_int* m, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int* ipiv, lapack_int *info ); -void LAPACK_zgetf2( lapack_int* m, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int* ipiv, lapack_int *info ); -void LAPACK_slaswp( lapack_int* n, float* a, lapack_int* lda, lapack_int* k1, - lapack_int* k2, const lapack_int* ipiv, lapack_int* incx ); -void LAPACK_dlaswp( lapack_int* n, double* a, lapack_int* lda, lapack_int* k1, - lapack_int* k2, const lapack_int* ipiv, lapack_int* incx ); -void LAPACK_claswp( lapack_int* n, lapack_complex_float* a, lapack_int* lda, - lapack_int* k1, lapack_int* k2, const lapack_int* ipiv, - lapack_int* incx ); -void LAPACK_zlaswp( lapack_int* n, lapack_complex_double* a, lapack_int* lda, - lapack_int* k1, lapack_int* k2, const lapack_int* ipiv, - lapack_int* incx ); -float LAPACK_slange( char* norm, lapack_int* m, lapack_int* n, const float* a, - lapack_int* lda, float* work ); -double LAPACK_dlange( char* norm, lapack_int* m, lapack_int* n, const double* a, - lapack_int* lda, double* work ); -float LAPACK_clange( char* norm, lapack_int* m, lapack_int* n, - const lapack_complex_float* a, lapack_int* lda, float* work ); -double LAPACK_zlange( char* norm, lapack_int* m, lapack_int* n, - const lapack_complex_double* a, lapack_int* lda, double* work ); -float LAPACK_clanhe( char* norm, char* uplo, lapack_int* n, - const lapack_complex_float* a, lapack_int* lda, float* work ); -double LAPACK_zlanhe( char* norm, char* uplo, lapack_int* n, - const 
lapack_complex_double* a, lapack_int* lda, double* work ); -void LAPACK_clarcm( lapack_int* m, lapack_int* n, const float* a, - lapack_int* lda, const lapack_complex_float* b, - lapack_int* ldb, lapack_complex_float* c, - lapack_int* ldc, float* work ); -void LAPACK_zlarcm( lapack_int* m, lapack_int* n, const double* a, - lapack_int* lda, const lapack_complex_double* b, - lapack_int* ldb, lapack_complex_double* c, - lapack_int* ldc, double* work ); -void LAPACK_clacrm( lapack_int* m, lapack_int* n, const lapack_complex_float* a, - lapack_int* lda, const float* b, - lapack_int* ldb, lapack_complex_float* c, - lapack_int* ldc, float* work ); -void LAPACK_zlacrm( lapack_int* m, lapack_int* n, const lapack_complex_double* a, - lapack_int* lda, const double* b, - lapack_int* ldb, lapack_complex_double* c, - lapack_int* ldc, double* work ); -float LAPACK_slansy( char* norm, char* uplo, lapack_int* n, const float* a, - lapack_int* lda, float* work ); -double LAPACK_dlansy( char* norm, char* uplo, lapack_int* n, const double* a, - lapack_int* lda, double* work ); -float LAPACK_clansy( char* norm, char* uplo, lapack_int* n, - const lapack_complex_float* a, lapack_int* lda, float* work ); -double LAPACK_zlansy( char* norm, char* uplo, lapack_int* n, - const lapack_complex_double* a, lapack_int* lda, double* work ); -float LAPACK_slantr( char* norm, char* uplo, char* diag, lapack_int* m, - lapack_int* n, const float* a, lapack_int* lda, float* work ); -double LAPACK_dlantr( char* norm, char* uplo, char* diag, lapack_int* m, - lapack_int* n, const double* a, lapack_int* lda, double* work ); -float LAPACK_clantr( char* norm, char* uplo, char* diag, lapack_int* m, - lapack_int* n, const lapack_complex_float* a, lapack_int* lda, - float* work ); -double LAPACK_zlantr( char* norm, char* uplo, char* diag, lapack_int* m, - lapack_int* n, const lapack_complex_double* a, lapack_int* lda, - double* work ); -float LAPACK_slamch( char* cmach ); -double LAPACK_dlamch( char* cmach ); -void LAPACK_sgelq2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - float* tau, float* work, lapack_int *info ); -void LAPACK_dgelq2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, - double* tau, double* work, lapack_int *info ); -void LAPACK_cgelq2( lapack_int* m, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* tau, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_zgelq2( lapack_int* m, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* tau, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_slarfb( char* side, char* trans, char* direct, char* storev, - lapack_int* m, lapack_int* n, lapack_int* k, const float* v, - lapack_int* ldv, const float* t, lapack_int* ldt, float* c, - lapack_int* ldc, float* work, lapack_int* ldwork ); -void LAPACK_dlarfb( char* side, char* trans, char* direct, char* storev, - lapack_int* m, lapack_int* n, lapack_int* k, - const double* v, lapack_int* ldv, const double* t, - lapack_int* ldt, double* c, lapack_int* ldc, double* work, - lapack_int* ldwork ); -void LAPACK_clarfb( char* side, char* trans, char* direct, char* storev, - lapack_int* m, lapack_int* n, lapack_int* k, - const lapack_complex_float* v, lapack_int* ldv, - const lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work, lapack_int* ldwork ); -void LAPACK_zlarfb( char* side, char* trans, char* direct, char* storev, - lapack_int* m, lapack_int* n, lapack_int* k, - const 
lapack_complex_double* v, lapack_int* ldv, - const lapack_complex_double* t, lapack_int* ldt, - lapack_complex_double* c, lapack_int* ldc, - lapack_complex_double* work, lapack_int* ldwork ); -void LAPACK_slarfg( lapack_int* n, float* alpha, float* x, lapack_int* incx, - float* tau ); -void LAPACK_dlarfg( lapack_int* n, double* alpha, double* x, lapack_int* incx, - double* tau ); -void LAPACK_clarfg( lapack_int* n, lapack_complex_float* alpha, - lapack_complex_float* x, lapack_int* incx, - lapack_complex_float* tau ); -void LAPACK_zlarfg( lapack_int* n, lapack_complex_double* alpha, - lapack_complex_double* x, lapack_int* incx, - lapack_complex_double* tau ); -void LAPACK_slassq( lapack_int *n, float* x, lapack_int *incx, float* scale, float* sumsq ); -void LAPACK_dlassq( lapack_int *n, double* x, lapack_int *incx, double* scale, double* sumsq ); -void LAPACK_classq( lapack_int *n, lapack_complex_float* x, lapack_int *incx, float* scale, float* sumsq ); -void LAPACK_zlassq( lapack_int *n, lapack_complex_double* x, lapack_int *incx, double* scale, double* sumsq ); -void LAPACK_slarft( char* direct, char* storev, lapack_int* n, lapack_int* k, - const float* v, lapack_int* ldv, const float* tau, float* t, - lapack_int* ldt ); -void LAPACK_dlarft( char* direct, char* storev, lapack_int* n, lapack_int* k, - const double* v, lapack_int* ldv, const double* tau, - double* t, lapack_int* ldt ); -void LAPACK_clarft( char* direct, char* storev, lapack_int* n, lapack_int* k, - const lapack_complex_float* v, lapack_int* ldv, - const lapack_complex_float* tau, lapack_complex_float* t, - lapack_int* ldt ); -void LAPACK_zlarft( char* direct, char* storev, lapack_int* n, lapack_int* k, - const lapack_complex_double* v, lapack_int* ldv, - const lapack_complex_double* tau, lapack_complex_double* t, - lapack_int* ldt ); -void LAPACK_slarfx( char* side, lapack_int* m, lapack_int* n, const float* v, - float* tau, float* c, lapack_int* ldc, float* work ); -void LAPACK_dlarfx( char* side, lapack_int* m, lapack_int* n, const double* v, - double* tau, double* c, lapack_int* ldc, double* work ); -void LAPACK_clarfx( char* side, lapack_int* m, lapack_int* n, - const lapack_complex_float* v, lapack_complex_float* tau, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work ); -void LAPACK_zlarfx( char* side, lapack_int* m, lapack_int* n, - const lapack_complex_double* v, lapack_complex_double* tau, - lapack_complex_double* c, lapack_int* ldc, - lapack_complex_double* work ); -void LAPACK_slatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, - char* sym, float* d, lapack_int* mode, float* cond, - float* dmax, lapack_int* kl, lapack_int* ku, char* pack, - float* a, lapack_int* lda, float* work, lapack_int *info ); -void LAPACK_dlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, - char* sym, double* d, lapack_int* mode, double* cond, - double* dmax, lapack_int* kl, lapack_int* ku, char* pack, - double* a, lapack_int* lda, double* work, - lapack_int *info ); -void LAPACK_clatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, - char* sym, float* d, lapack_int* mode, float* cond, - float* dmax, lapack_int* kl, lapack_int* ku, char* pack, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_zlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, - char* sym, double* d, lapack_int* mode, double* cond, - double* dmax, lapack_int* kl, lapack_int* ku, char* pack, - lapack_complex_double* a, 
lapack_int* lda, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_slag2d( lapack_int* m, lapack_int* n, const float* sa, - lapack_int* ldsa, double* a, lapack_int* lda, - lapack_int *info ); -void LAPACK_dlag2s( lapack_int* m, lapack_int* n, const double* a, - lapack_int* lda, float* sa, lapack_int* ldsa, - lapack_int *info ); -void LAPACK_clag2z( lapack_int* m, lapack_int* n, - const lapack_complex_float* sa, lapack_int* ldsa, - lapack_complex_double* a, lapack_int* lda, - lapack_int *info ); -void LAPACK_zlag2c( lapack_int* m, lapack_int* n, - const lapack_complex_double* a, lapack_int* lda, - lapack_complex_float* sa, lapack_int* ldsa, - lapack_int *info ); -void LAPACK_slauum( char* uplo, lapack_int* n, float* a, lapack_int* lda, - lapack_int *info ); -void LAPACK_dlauum( char* uplo, lapack_int* n, double* a, lapack_int* lda, - lapack_int *info ); -void LAPACK_clauum( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int *info ); -void LAPACK_zlauum( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int *info ); -void LAPACK_slagge( lapack_int* m, lapack_int* n, lapack_int* kl, - lapack_int* ku, const float* d, float* a, lapack_int* lda, - lapack_int* iseed, float* work, lapack_int *info ); -void LAPACK_dlagge( lapack_int* m, lapack_int* n, lapack_int* kl, - lapack_int* ku, const double* d, double* a, lapack_int* lda, - lapack_int* iseed, double* work, lapack_int *info ); -void LAPACK_clagge( lapack_int* m, lapack_int* n, lapack_int* kl, - lapack_int* ku, const float* d, lapack_complex_float* a, - lapack_int* lda, lapack_int* iseed, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_zlagge( lapack_int* m, lapack_int* n, lapack_int* kl, - lapack_int* ku, const double* d, lapack_complex_double* a, - lapack_int* lda, lapack_int* iseed, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_slascl( char* type, lapack_int* kl, lapack_int* ku, float* cfrom, - float* cto, lapack_int* m, lapack_int* n, float* a, - lapack_int* lda, lapack_int *info ); -void LAPACK_dlascl( char* type, lapack_int* kl, lapack_int* ku, double* cfrom, - double* cto, lapack_int* m, lapack_int* n, double* a, - lapack_int* lda, lapack_int *info ); -void LAPACK_clascl( char* type, lapack_int* kl, lapack_int* ku, float* cfrom, - float* cto, lapack_int* m, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int *info ); -void LAPACK_zlascl( char* type, lapack_int* kl, lapack_int* ku, double* cfrom, - double* cto, lapack_int* m, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int *info ); -void LAPACK_slaset( char* uplo, lapack_int* m, lapack_int* n, float* alpha, - float* beta, float* a, lapack_int* lda ); -void LAPACK_dlaset( char* uplo, lapack_int* m, lapack_int* n, double* alpha, - double* beta, double* a, lapack_int* lda ); -void LAPACK_claset( char* uplo, lapack_int* m, lapack_int* n, - lapack_complex_float* alpha, lapack_complex_float* beta, - lapack_complex_float* a, lapack_int* lda ); -void LAPACK_zlaset( char* uplo, lapack_int* m, lapack_int* n, - lapack_complex_double* alpha, lapack_complex_double* beta, - lapack_complex_double* a, lapack_int* lda ); -void LAPACK_slasrt( char* id, lapack_int* n, float* d, lapack_int *info ); -void LAPACK_dlasrt( char* id, lapack_int* n, double* d, lapack_int *info ); -void LAPACK_claghe( lapack_int* n, lapack_int* k, const float* d, - lapack_complex_float* a, lapack_int* lda, lapack_int* iseed, - lapack_complex_float* work, lapack_int *info ); -void 
LAPACK_zlaghe( lapack_int* n, lapack_int* k, const double* d, - lapack_complex_double* a, lapack_int* lda, - lapack_int* iseed, lapack_complex_double* work, - lapack_int *info ); -void LAPACK_slagsy( lapack_int* n, lapack_int* k, const float* d, float* a, - lapack_int* lda, lapack_int* iseed, float* work, - lapack_int *info ); -void LAPACK_dlagsy( lapack_int* n, lapack_int* k, const double* d, double* a, - lapack_int* lda, lapack_int* iseed, double* work, - lapack_int *info ); -void LAPACK_clagsy( lapack_int* n, lapack_int* k, const float* d, - lapack_complex_float* a, lapack_int* lda, lapack_int* iseed, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_zlagsy( lapack_int* n, lapack_int* k, const double* d, - lapack_complex_double* a, lapack_int* lda, - lapack_int* iseed, lapack_complex_double* work, - lapack_int *info ); -void LAPACK_slapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n, - float* x, lapack_int* ldx, lapack_int* k ); -void LAPACK_dlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n, - double* x, lapack_int* ldx, lapack_int* k ); -void LAPACK_clapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n, - lapack_complex_float* x, lapack_int* ldx, lapack_int* k ); -void LAPACK_zlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n, - lapack_complex_double* x, lapack_int* ldx, lapack_int* k ); -void LAPACK_slapmt( lapack_logical* forwrd, lapack_int* m, lapack_int* n, - float* x, lapack_int* ldx, lapack_int* k ); -void LAPACK_dlapmt( lapack_logical* forwrd, lapack_int* m, lapack_int* n, - double* x, lapack_int* ldx, lapack_int* k ); -void LAPACK_clapmt( lapack_logical* forwrd, lapack_int* m, lapack_int* n, - lapack_complex_float* x, lapack_int* ldx, lapack_int* k ); -void LAPACK_zlapmt( lapack_logical* forwrd, lapack_int* m, lapack_int* n, - lapack_complex_double* x, lapack_int* ldx, lapack_int* k ); -float LAPACK_slapy2( float* x, float* y ); -double LAPACK_dlapy2( double* x, double* y ); -float LAPACK_slapy3( float* x, float* y, float* z ); -double LAPACK_dlapy3( double* x, double* y, double* z ); -void LAPACK_slartgp( float* f, float* g, float* cs, float* sn, float* r ); -void LAPACK_dlartgp( double* f, double* g, double* cs, double* sn, double* r ); -void LAPACK_slartgs( float* x, float* y, float* sigma, float* cs, float* sn ); -void LAPACK_dlartgs( double* x, double* y, double* sigma, double* cs, - double* sn ); -// LAPACK 3.3.0 -void LAPACK_cbbcsd( char* jobu1, char* jobu2, - char* jobv1t, char* jobv2t, char* trans, - lapack_int* m, lapack_int* p, lapack_int* q, - float* theta, float* phi, - lapack_complex_float* u1, lapack_int* ldu1, - lapack_complex_float* u2, lapack_int* ldu2, - lapack_complex_float* v1t, lapack_int* ldv1t, - lapack_complex_float* v2t, lapack_int* ldv2t, - float* b11d, float* b11e, float* b12d, - float* b12e, float* b21d, float* b21e, - float* b22d, float* b22e, float* rwork, - lapack_int* lrwork , lapack_int *info ); -void LAPACK_cheswapr( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int* i1, lapack_int* i2 ); -void LAPACK_chetri2( char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_chetri2x( char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_float* work, lapack_int* nb , lapack_int *info ); -void LAPACK_chetrs2( char* uplo, lapack_int* n, - lapack_int* nrhs, const lapack_complex_float* a, - 
lapack_int* lda, const lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work , lapack_int *info ); -void LAPACK_csyconv( char* uplo, char* way, - lapack_int* n, lapack_complex_float* a, - lapack_int* lda, const lapack_int* ipiv, - lapack_complex_float* e , lapack_int *info ); -void LAPACK_csyswapr( char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_int* i1, lapack_int* i2 ); -void LAPACK_csytri2( char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_csytri2x( char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_float* work, lapack_int* nb , lapack_int *info ); -void LAPACK_csytrs2( char* uplo, lapack_int* n, - lapack_int* nrhs, const lapack_complex_float* a, - lapack_int* lda, const lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work , lapack_int *info ); -void LAPACK_cunbdb( char* trans, char* signs, - lapack_int* m, lapack_int* p, lapack_int* q, - lapack_complex_float* x11, lapack_int* ldx11, - lapack_complex_float* x12, lapack_int* ldx12, - lapack_complex_float* x21, lapack_int* ldx21, - lapack_complex_float* x22, lapack_int* ldx22, - float* theta, float* phi, - lapack_complex_float* taup1, - lapack_complex_float* taup2, - lapack_complex_float* tauq1, - lapack_complex_float* tauq2, - lapack_complex_float* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_cuncsd( char* jobu1, char* jobu2, - char* jobv1t, char* jobv2t, char* trans, - char* signs, lapack_int* m, lapack_int* p, - lapack_int* q, lapack_complex_float* x11, - lapack_int* ldx11, lapack_complex_float* x12, - lapack_int* ldx12, lapack_complex_float* x21, - lapack_int* ldx21, lapack_complex_float* x22, - lapack_int* ldx22, float* theta, - lapack_complex_float* u1, lapack_int* ldu1, - lapack_complex_float* u2, lapack_int* ldu2, - lapack_complex_float* v1t, lapack_int* ldv1t, - lapack_complex_float* v2t, lapack_int* ldv2t, - lapack_complex_float* work, lapack_int* lwork, - float* rwork, lapack_int* lrwork, - lapack_int* iwork , lapack_int *info ); -void LAPACK_cuncsd2by1( char* jobu1, char* jobu2, - char* jobv1t, lapack_int* m, lapack_int* p, - lapack_int* q, lapack_complex_float* x11, - lapack_int* ldx11, lapack_complex_float* x21, - lapack_int* ldx21, float* theta, - lapack_complex_float* u1, lapack_int* ldu1, - lapack_complex_float* u2, lapack_int* ldu2, - lapack_complex_float* v1t, lapack_int* ldv1t, - lapack_complex_float* work, lapack_int* lwork, - float* rwork, lapack_int* lrwork, - lapack_int* iwork , lapack_int *info ); -void LAPACK_dbbcsd( char* jobu1, char* jobu2, - char* jobv1t, char* jobv2t, char* trans, - lapack_int* m, lapack_int* p, lapack_int* q, - double* theta, double* phi, double* u1, - lapack_int* ldu1, double* u2, lapack_int* ldu2, - double* v1t, lapack_int* ldv1t, double* v2t, - lapack_int* ldv2t, double* b11d, double* b11e, - double* b12d, double* b12e, double* b21d, - double* b21e, double* b22d, double* b22e, - double* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_dorbdb( char* trans, char* signs, - lapack_int* m, lapack_int* p, lapack_int* q, - double* x11, lapack_int* ldx11, double* x12, - lapack_int* ldx12, double* x21, lapack_int* ldx21, - double* x22, lapack_int* ldx22, double* theta, - double* phi, double* taup1, double* taup2, - double* tauq1, double* tauq2, double* work, - lapack_int* lwork , 
lapack_int *info ); -void LAPACK_dorcsd( char* jobu1, char* jobu2, - char* jobv1t, char* jobv2t, char* trans, - char* signs, lapack_int* m, lapack_int* p, - lapack_int* q, double* x11, lapack_int* ldx11, - double* x12, lapack_int* ldx12, double* x21, - lapack_int* ldx21, double* x22, lapack_int* ldx22, - double* theta, double* u1, lapack_int* ldu1, - double* u2, lapack_int* ldu2, double* v1t, - lapack_int* ldv1t, double* v2t, lapack_int* ldv2t, - double* work, lapack_int* lwork, - lapack_int* iwork , lapack_int *info ); -void LAPACK_dorcsd2by1( char* jobu1, char* jobu2, - char* jobv1t, lapack_int* m, lapack_int* p, - lapack_int* q, double* x11, lapack_int* ldx11, - double* x21, lapack_int* ldx21, - double* theta, double* u1, lapack_int* ldu1, - double* u2, lapack_int* ldu2, double* v1t, - lapack_int* ldv1t, double* work, lapack_int* lwork, - lapack_int* iwork , lapack_int *info ); -void LAPACK_dsyconv( char* uplo, char* way, - lapack_int* n, double* a, lapack_int* lda, - const lapack_int* ipiv, double* e , lapack_int *info ); -void LAPACK_dsyswapr( char* uplo, lapack_int* n, double* a, - lapack_int* lda, lapack_int* i1, lapack_int* i2 ); -void LAPACK_dsytri2( char* uplo, lapack_int* n, - double* a, lapack_int* lda, - const lapack_int* ipiv, - double* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_dsytri2x( char* uplo, lapack_int* n, - double* a, lapack_int* lda, - const lapack_int* ipiv, double* work, - lapack_int* nb , lapack_int *info ); -void LAPACK_dsytrs2( char* uplo, lapack_int* n, - lapack_int* nrhs, const double* a, - lapack_int* lda, const lapack_int* ipiv, - double* b, lapack_int* ldb, double* work , lapack_int *info ); -void LAPACK_sbbcsd( char* jobu1, char* jobu2, - char* jobv1t, char* jobv2t, char* trans, - lapack_int* m, lapack_int* p, lapack_int* q, - float* theta, float* phi, float* u1, - lapack_int* ldu1, float* u2, lapack_int* ldu2, - float* v1t, lapack_int* ldv1t, float* v2t, - lapack_int* ldv2t, float* b11d, float* b11e, - float* b12d, float* b12e, float* b21d, - float* b21e, float* b22d, float* b22e, - float* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_sorbdb( char* trans, char* signs, - lapack_int* m, lapack_int* p, lapack_int* q, - float* x11, lapack_int* ldx11, float* x12, - lapack_int* ldx12, float* x21, lapack_int* ldx21, - float* x22, lapack_int* ldx22, float* theta, - float* phi, float* taup1, float* taup2, - float* tauq1, float* tauq2, float* work, - lapack_int* lwork , lapack_int *info ); -void LAPACK_sorcsd( char* jobu1, char* jobu2, - char* jobv1t, char* jobv2t, char* trans, - char* signs, lapack_int* m, lapack_int* p, - lapack_int* q, float* x11, lapack_int* ldx11, - float* x12, lapack_int* ldx12, float* x21, - lapack_int* ldx21, float* x22, lapack_int* ldx22, - float* theta, float* u1, lapack_int* ldu1, - float* u2, lapack_int* ldu2, float* v1t, - lapack_int* ldv1t, float* v2t, lapack_int* ldv2t, - float* work, lapack_int* lwork, - lapack_int* iwork , lapack_int *info ); -void LAPACK_sorcsd2by1( char* jobu1, char* jobu2, - char* jobv1t, lapack_int* m, lapack_int* p, - lapack_int* q, float* x11, lapack_int* ldx11, - float* x21, lapack_int* ldx21, - float* theta, float* u1, lapack_int* ldu1, - float* u2, lapack_int* ldu2, float* v1t, - lapack_int* ldv1t, float* work, lapack_int* lwork, - lapack_int* iwork , lapack_int *info ); -void LAPACK_ssyconv( char* uplo, char* way, - lapack_int* n, float* a, lapack_int* lda, - const lapack_int* ipiv, float* e , lapack_int *info ); -void LAPACK_ssyswapr( char* uplo, lapack_int* n, float* a, - 
lapack_int* lda, lapack_int* i1, lapack_int* i2 ); -void LAPACK_ssytri2( char* uplo, lapack_int* n, - float* a, lapack_int* lda, - const lapack_int* ipiv, - float* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_ssytri2x( char* uplo, lapack_int* n, - float* a, lapack_int* lda, - const lapack_int* ipiv, float* work, - lapack_int* nb , lapack_int *info ); -void LAPACK_ssytrs2( char* uplo, lapack_int* n, - lapack_int* nrhs, const float* a, - lapack_int* lda, const lapack_int* ipiv, - float* b, lapack_int* ldb, float* work , lapack_int *info ); -void LAPACK_zbbcsd( char* jobu1, char* jobu2, - char* jobv1t, char* jobv2t, char* trans, - lapack_int* m, lapack_int* p, lapack_int* q, - double* theta, double* phi, - lapack_complex_double* u1, lapack_int* ldu1, - lapack_complex_double* u2, lapack_int* ldu2, - lapack_complex_double* v1t, lapack_int* ldv1t, - lapack_complex_double* v2t, lapack_int* ldv2t, - double* b11d, double* b11e, double* b12d, - double* b12e, double* b21d, double* b21e, - double* b22d, double* b22e, double* rwork, - lapack_int* lrwork , lapack_int *info ); -void LAPACK_zheswapr( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int* i1, lapack_int* i2 ); -void LAPACK_zhetri2( char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_zhetri2x( char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_double* work, lapack_int* nb , lapack_int *info ); -void LAPACK_zhetrs2( char* uplo, lapack_int* n, - lapack_int* nrhs, - const lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work , lapack_int *info ); -void LAPACK_zsyconv( char* uplo, char* way, - lapack_int* n, lapack_complex_double* a, - lapack_int* lda, const lapack_int* ipiv, - lapack_complex_double* e , lapack_int *info ); -void LAPACK_zsyswapr( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int* i1, - lapack_int* i2 ); -void LAPACK_zsytri2( char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_zsytri2x( char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_double* work, lapack_int* nb , lapack_int *info ); -void LAPACK_zsytrs2( char* uplo, lapack_int* n, - lapack_int* nrhs, - const lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work , lapack_int *info ); -void LAPACK_zunbdb( char* trans, char* signs, - lapack_int* m, lapack_int* p, lapack_int* q, - lapack_complex_double* x11, lapack_int* ldx11, - lapack_complex_double* x12, lapack_int* ldx12, - lapack_complex_double* x21, lapack_int* ldx21, - lapack_complex_double* x22, lapack_int* ldx22, - double* theta, double* phi, - lapack_complex_double* taup1, - lapack_complex_double* taup2, - lapack_complex_double* tauq1, - lapack_complex_double* tauq2, - lapack_complex_double* work, lapack_int* lwork , lapack_int *info ); -void LAPACK_zuncsd( char* jobu1, char* jobu2, - char* jobv1t, char* jobv2t, char* trans, - char* signs, lapack_int* m, lapack_int* p, - lapack_int* q, lapack_complex_double* x11, - lapack_int* ldx11, lapack_complex_double* x12, - lapack_int* ldx12, 
lapack_complex_double* x21, - lapack_int* ldx21, lapack_complex_double* x22, - lapack_int* ldx22, double* theta, - lapack_complex_double* u1, lapack_int* ldu1, - lapack_complex_double* u2, lapack_int* ldu2, - lapack_complex_double* v1t, lapack_int* ldv1t, - lapack_complex_double* v2t, lapack_int* ldv2t, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, lapack_int* lrwork, - lapack_int* iwork , lapack_int *info ); -void LAPACK_zuncsd2by1( char* jobu1, char* jobu2, - char* jobv1t, lapack_int* m, lapack_int* p, - lapack_int* q, lapack_complex_double* x11, - lapack_int* ldx11, lapack_complex_double* x21, - lapack_int* ldx21, double* theta, - lapack_complex_double* u1, lapack_int* ldu1, - lapack_complex_double* u2, lapack_int* ldu2, - lapack_complex_double* v1t, lapack_int* ldv1t, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, lapack_int* lrwork, - lapack_int* iwork , lapack_int *info ); -// LAPACK 3.4.0 -void LAPACK_sgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* nb, const float* v, - lapack_int* ldv, const float* t, lapack_int* ldt, float* c, - lapack_int* ldc, float* work, lapack_int *info ); -void LAPACK_dgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* nb, const double* v, - lapack_int* ldv, const double* t, lapack_int* ldt, - double* c, lapack_int* ldc, double* work, - lapack_int *info ); -void LAPACK_cgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* nb, - const lapack_complex_float* v, lapack_int* ldv, - const lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_zgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* nb, - const lapack_complex_double* v, lapack_int* ldv, - const lapack_complex_double* t, lapack_int* ldt, - lapack_complex_double* c, lapack_int* ldc, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_sgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, float* a, - lapack_int* lda, float* t, lapack_int* ldt, float* work, - lapack_int *info ); -void LAPACK_dgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, double* a, - lapack_int* lda, double* t, lapack_int* ldt, double* work, - lapack_int *info ); -void LAPACK_cgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_zgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* t, lapack_int* ldt, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_sgeqrt2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - float* t, lapack_int* ldt, lapack_int *info ); -void LAPACK_dgeqrt2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, - double* t, lapack_int* ldt, lapack_int *info ); -void LAPACK_cgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* t, lapack_int* ldt, - lapack_int *info ); -void LAPACK_zgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* t, lapack_int* ldt, - lapack_int *info ); -void LAPACK_sgeqrt3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - float* t, lapack_int* ldt, lapack_int *info ); -void LAPACK_dgeqrt3( lapack_int* m, lapack_int* n, double* a, 
lapack_int* lda, - double* t, lapack_int* ldt, lapack_int *info ); -void LAPACK_cgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* t, lapack_int* ldt, - lapack_int *info ); -void LAPACK_zgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* t, lapack_int* ldt, - lapack_int *info ); -void LAPACK_stpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* l, lapack_int* nb, - const float* v, lapack_int* ldv, const float* t, - lapack_int* ldt, float* a, lapack_int* lda, float* b, - lapack_int* ldb, float* work, lapack_int *info ); -void LAPACK_dtpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* l, lapack_int* nb, - const double* v, lapack_int* ldv, const double* t, - lapack_int* ldt, double* a, lapack_int* lda, double* b, - lapack_int* ldb, double* work, lapack_int *info ); -void LAPACK_ctpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* l, lapack_int* nb, - const lapack_complex_float* v, lapack_int* ldv, - const lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_ztpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n, - lapack_int* k, lapack_int* l, lapack_int* nb, - const lapack_complex_double* v, lapack_int* ldv, - const lapack_complex_double* t, lapack_int* ldt, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_stpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, - float* a, lapack_int* lda, float* b, lapack_int* ldb, - float* t, lapack_int* ldt, float* work, lapack_int *info ); -void LAPACK_dtpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, - double* a, lapack_int* lda, double* b, lapack_int* ldb, - double* t, lapack_int* ldt, double* work, - lapack_int *info ); -void LAPACK_ctpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* work, lapack_int *info ); -void LAPACK_ztpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* t, lapack_int* ldt, - lapack_complex_double* work, lapack_int *info ); -void LAPACK_stpqrt2( lapack_int* m, lapack_int* n, lapack_int* l, - float* a, lapack_int* lda, - float* b, lapack_int* ldb, - float* t, lapack_int* ldt, - lapack_int *info ); -void LAPACK_dtpqrt2( lapack_int* m, lapack_int* n, lapack_int* l, - double* a, lapack_int* lda, - double* b, lapack_int* ldb, - double* t, lapack_int* ldt, - lapack_int *info ); -void LAPACK_ctpqrt2( lapack_int* m, lapack_int* n, lapack_int* l, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* t, lapack_int* ldt, - lapack_int *info ); -void LAPACK_ztpqrt2( lapack_int* m, lapack_int* n, lapack_int* l, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* t, lapack_int* ldt, - lapack_int *info ); -void LAPACK_stprfb( char* side, char* trans, char* direct, char* storev, - lapack_int* m, lapack_int* n, lapack_int* k, 
lapack_int* l, - const float* v, lapack_int* ldv, const float* t, - lapack_int* ldt, float* a, lapack_int* lda, float* b, - lapack_int* ldb, const float* work, - lapack_int* ldwork ); -void LAPACK_dtprfb( char* side, char* trans, char* direct, char* storev, - lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, - const double* v, lapack_int* ldv, const double* t, - lapack_int* ldt, double* a, lapack_int* lda, double* b, - lapack_int* ldb, const double* work, - lapack_int* ldwork ); -void LAPACK_ctprfb( char* side, char* trans, char* direct, char* storev, - lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, - const lapack_complex_float* v, lapack_int* ldv, - const lapack_complex_float* t, lapack_int* ldt, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int* ldwork ); -void LAPACK_ztprfb( char* side, char* trans, char* direct, char* storev, - lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, - const lapack_complex_double* v, lapack_int* ldv, - const lapack_complex_double* t, lapack_int* ldt, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* ldwork ); -// LAPACK 3.5.0 -void LAPACK_ssysv_rook( char* uplo, lapack_int* n, lapack_int* nrhs, float* a, - lapack_int* lda, lapack_int* ipiv, float* b, - lapack_int* ldb, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_ssytrf_rook( char* uplo, lapack_int* n, float* a, lapack_int* lda, - lapack_int* ipiv, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dsysv_rook( char* uplo, lapack_int* n, lapack_int* nrhs, double* a, - lapack_int* lda, lapack_int* ipiv, double* b, - lapack_int* ldb, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dsytrf_rook( char* uplo, lapack_int* n, double* a, lapack_int* lda, - lapack_int* ipiv, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_csysv_rook( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, - lapack_int* ipiv, lapack_complex_float* b, - lapack_int* ldb, lapack_complex_float* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_csytrf_rook( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zsysv_rook( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, - lapack_int* ipiv, lapack_complex_double* b, - lapack_int* ldb, lapack_complex_double* work, - lapack_int* lwork, lapack_int *info ); -void LAPACK_zsytrf_rook( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_ssytrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, - lapack_int* lda, const lapack_int* ipiv, float* b, - lapack_int* ldb, lapack_int *info ); -void LAPACK_dsytrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs, - const double* a, lapack_int* lda, const lapack_int* ipiv, - double* b, lapack_int* ldb, lapack_int *info ); -void LAPACK_csytrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs, - const lapack_complex_float* a, lapack_int* lda, - const lapack_int* ipiv, lapack_complex_float* b, - lapack_int* ldb, lapack_int *info ); -void LAPACK_zsytrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs, - const lapack_complex_double* a, lapack_int* lda, - const 
lapack_int* ipiv, lapack_complex_double* b, - lapack_int* ldb, lapack_int *info ); -void LAPACK_chetrf_rook( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zhetrf_rook( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chetrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs, - const lapack_complex_float* a, lapack_int* lda, - const lapack_int* ipiv, lapack_complex_float* b, - lapack_int* ldb, lapack_int *info ); -void LAPACK_zhetrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs, - const lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, lapack_complex_double* b, - lapack_int* ldb, lapack_int *info ); - -void LAPACK_csyr( char* uplo, lapack_int* n, lapack_complex_float* alpha, - const lapack_complex_float* x, lapack_int* incx, - lapack_complex_float* a, lapack_int* lda ); -void LAPACK_zsyr( char* uplo, lapack_int* n, lapack_complex_double* alpha, - const lapack_complex_double* x, lapack_int* incx, - lapack_complex_double* a, lapack_int* lda ); -void LAPACK_ilaver( const lapack_int* vers_major, const lapack_int* vers_minor, - const lapack_int* vers_patch ); - -// LAPACK 3.7.0 -void LAPACK_ssysv_aa( char* uplo, lapack_int* n, lapack_int* nrhs, float* a, - lapack_int* lda, lapack_int* ipiv, float* b, lapack_int* ldb, - float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dsysv_aa( char* uplo, lapack_int* n, lapack_int* nrhs, double* a, - lapack_int* lda, lapack_int* ipiv, double* b, - lapack_int* ldb, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_csysv_aa( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zsysv_aa( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chesv_aa( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zhesv_aa( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); - -void LAPACK_ssytrf_aa( char* uplo, lapack_int* n, float* a, lapack_int* lda, - lapack_int* ipiv, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dsytrf_aa( char* uplo, lapack_int* n, double* a, lapack_int* lda, - lapack_int* ipiv, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_csytrf_aa( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zsytrf_aa( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chetrf_aa( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_int* ipiv, - 
lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zhetrf_aa( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); - -void LAPACK_ssytrs_aa( char* uplo, lapack_int* n, - lapack_int* nrhs, const float* a, - lapack_int* lda, const lapack_int* ipiv, - float* b, lapack_int* ldb, float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dsytrs_aa( char* uplo, lapack_int* n, - lapack_int* nrhs, const double* a, - lapack_int* lda, const lapack_int* ipiv, - double* b, lapack_int* ldb, double* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_csytrs_aa( char* uplo, lapack_int* n, - lapack_int* nrhs, const lapack_complex_float* a, - lapack_int* lda, const lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work , lapack_int* lwork, lapack_int *info ); -void LAPACK_zsytrs_aa( char* uplo, lapack_int* n, - lapack_int* nrhs, - const lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_chetrs_aa( char* uplo, lapack_int* n, - lapack_int* nrhs, const lapack_complex_float* a, - lapack_int* lda, const lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work , lapack_int* lwork, lapack_int *info ); -void LAPACK_zhetrs_aa( char* uplo, lapack_int* n, - lapack_int* nrhs, - const lapack_complex_double* a, lapack_int* lda, - const lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, lapack_int *info ); - -void LAPACK_ssysv_rk( char* uplo, lapack_int* n, lapack_int* nrhs, float* a, - lapack_int* lda, float* e, lapack_int* ipiv, float* b, lapack_int* ldb, - float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dsysv_rk( char* uplo, lapack_int* n, lapack_int* nrhs, double* a, - lapack_int* lda, double* e, lapack_int* ipiv, double* b, - lapack_int* ldb, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_csysv_rk( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* e, lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zsysv_rk( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* e, lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chesv_rk( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* e, lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zhesv_rk( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* e, lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); - -void LAPACK_ssytrf_rk( char* uplo, lapack_int* n, float* a, lapack_int* lda, - float* e, lapack_int* ipiv, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dsytrf_rk( char* uplo, lapack_int* n, double* a, lapack_int* lda, - double* e, lapack_int* ipiv, double* work, lapack_int* lwork, - 
lapack_int *info ); -void LAPACK_csytrf_rk( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* e, lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zsytrf_rk( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* e, lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chetrf_rk( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, lapack_complex_float* e, lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zhetrf_rk( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, lapack_complex_double* e, lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); - -void LAPACK_ssytrs_3( char* uplo, lapack_int* n, - lapack_int* nrhs, const float* a, - lapack_int* lda, const float* e, const lapack_int* ipiv, - float* b, lapack_int* ldb, lapack_int *info ); -void LAPACK_dsytrs_3( char* uplo, lapack_int* n, - lapack_int* nrhs, const double* a, - lapack_int* lda, const double* e, const lapack_int* ipiv, - double* b, lapack_int* ldb, lapack_int *info ); -void LAPACK_csytrs_3( char* uplo, lapack_int* n, - lapack_int* nrhs, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* e, - const lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, lapack_int *info ); -void LAPACK_zsytrs_3( char* uplo, lapack_int* n, - lapack_int* nrhs, - const lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* e, const lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, lapack_int *info ); -void LAPACK_chetrs_3( char* uplo, lapack_int* n, - lapack_int* nrhs, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* e, - const lapack_int* ipiv, - lapack_complex_float* b, lapack_int* ldb, lapack_int *info ); -void LAPACK_zhetrs_3( char* uplo, lapack_int* n, - lapack_int* nrhs, - const lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* e, - const lapack_int* ipiv, - lapack_complex_double* b, lapack_int* ldb, lapack_int *info ); - -void LAPACK_ssytri_3( char* uplo, lapack_int* n, float* a, lapack_int* lda, const float* e, - const lapack_int* ipiv, float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dsytri_3( char* uplo, lapack_int* n, double* a, lapack_int* lda, const double* e, - const lapack_int* ipiv, double* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_csytri_3( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* e, const lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_zsytri_3( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, const lapack_complex_double* e, const lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_chetri_3( char* uplo, lapack_int* n, lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* e, const lapack_int* ipiv, - lapack_complex_float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_zhetri_3( char* uplo, lapack_int* n, lapack_complex_double* a, - lapack_int* lda, const lapack_complex_double* e, const lapack_int* ipiv, - lapack_complex_double* work, lapack_int* lwork, lapack_int *info ); - -void LAPACK_ssycon_3( char* uplo, lapack_int* n, const float* a, lapack_int* lda, const float* e, - 
const lapack_int* ipiv, float* anorm, float* rcond, - float* work, lapack_int* iwork, lapack_int *info ); -void LAPACK_dsycon_3( char* uplo, lapack_int* n, const double* a, lapack_int* lda, const double* e, - const lapack_int* ipiv, double* anorm, double* rcond, - double* work, lapack_int* iwork, lapack_int *info ); -void LAPACK_csycon_3( char* uplo, lapack_int* n, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* e, const lapack_int* ipiv, float* anorm, - float* rcond, lapack_complex_float* work, - lapack_int *info ); -void LAPACK_zsycon_3( char* uplo, lapack_int* n, const lapack_complex_double* a, - lapack_int* lda, const lapack_complex_double* e, const lapack_int* ipiv, double* anorm, - double* rcond, lapack_complex_double* work, - lapack_int *info ); -void LAPACK_checon_3( char* uplo, lapack_int* n, const lapack_complex_float* a, - lapack_int* lda, const lapack_complex_float* e, const lapack_int* ipiv, float* anorm, - float* rcond, lapack_complex_float* work, - lapack_int *info ); -void LAPACK_zhecon_3( char* uplo, lapack_int* n, const lapack_complex_double* a, - lapack_int* lda, const lapack_complex_double* e, const lapack_int* ipiv, double* anorm, - double* rcond, lapack_complex_double* work, - lapack_int *info ); - -void LAPACK_sgelq( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - float* t, lapack_int* tsize, float* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_dgelq( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, - double* t, lapack_int* tsize, double* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_cgelq( lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* t, lapack_int* tsize, lapack_complex_float* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_zgelq( lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* t, lapack_int* tsize, lapack_complex_double* work, lapack_int* lwork, - lapack_int* info ); - -void LAPACK_sgemlq( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, - const float* a, lapack_int* lda, - const float* t, lapack_int* tsize, - float* c, lapack_int* ldc, - float* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_dgemlq( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, - const double* a, lapack_int* lda, - const double* t, lapack_int* tsize, - double* c, lapack_int* ldc, - double* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_cgemlq( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, - const lapack_complex_float* a, lapack_int* lda, - const lapack_complex_float* t, lapack_int* tsize, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_zgemlq( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, - const lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* t, lapack_int* tsize, - lapack_complex_double* c, lapack_int* ldc, - lapack_complex_double* work, lapack_int* lwork, - lapack_int* info ); - -void LAPACK_sgeqr( lapack_int* m, lapack_int* n, float* a, lapack_int* lda, - float* t, lapack_int* tsize, float* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_dgeqr( lapack_int* m, lapack_int* n, double* a, lapack_int* lda, - double* t, lapack_int* tsize, double* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_cgeqr( lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* 
lda, - lapack_complex_float* t, lapack_int* tsize, lapack_complex_float* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_zgeqr( lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* t, lapack_int* tsize, lapack_complex_double* work, lapack_int* lwork, - lapack_int* info ); - -void LAPACK_sgemqr( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, - const float* a, lapack_int* lda, - const float* t, lapack_int* tsize, - float* c, lapack_int* ldc, - float* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_dgemqr( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, - const double* a, lapack_int* lda, - const double* t, lapack_int* tsize, - double* c, lapack_int* ldc, - double* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_cgemqr( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, - const lapack_complex_float* a, lapack_int* lda, - const lapack_complex_float* t, lapack_int* tsize, - lapack_complex_float* c, lapack_int* ldc, - lapack_complex_float* work, lapack_int* lwork, - lapack_int* info ); -void LAPACK_zgemqr( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, - const lapack_complex_double* a, lapack_int* lda, - const lapack_complex_double* t, lapack_int* tsize, - lapack_complex_double* c, lapack_int* ldc, - lapack_complex_double* work, lapack_int* lwork, - lapack_int* info ); - -void LAPACK_sgetsls( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, - float* a, lapack_int* lda, float* b, lapack_int* ldb, - float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dgetsls( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, - double* a, lapack_int* lda, double* b, lapack_int* ldb, - double* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_cgetsls( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zgetsls( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); - -void LAPACK_ssyev_2stage( char* jobz, char* uplo, lapack_int* n, float* a, - lapack_int* lda, float* w, float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_dsyev_2stage( char* jobz, char* uplo, lapack_int* n, double* a, - lapack_int* lda, double* w, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_cheev_2stage( char* jobz, char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, float* w, - lapack_complex_float* work, lapack_int* lwork, float* rwork, - lapack_int *info ); -void LAPACK_zheev_2stage( char* jobz, char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, double* w, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, lapack_int *info ); -void LAPACK_ssyevd_2stage( char* jobz, char* uplo, lapack_int* n, float* a, - lapack_int* lda, float* w, float* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_dsyevd_2stage( char* jobz, char* uplo, lapack_int* n, double* a, - lapack_int* lda, double* w, double* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_cheevd_2stage( char* jobz, char* uplo, lapack_int* n, - 
lapack_complex_float* a, lapack_int* lda, float* w, - lapack_complex_float* work, lapack_int* lwork, float* rwork, - lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, - lapack_int *info ); -void LAPACK_zheevd_2stage( char* jobz, char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, double* w, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, lapack_int* lrwork, lapack_int* iwork, - lapack_int* liwork, lapack_int *info ); -void LAPACK_ssyevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - float* a, lapack_int* lda, float* vl, float* vu, - lapack_int* il, lapack_int* iu, float* abstol, - lapack_int* m, float* w, float* z, lapack_int* ldz, - float* work, lapack_int* lwork, lapack_int* iwork, - lapack_int* ifail, lapack_int *info ); -void LAPACK_dsyevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - double* a, lapack_int* lda, double* vl, double* vu, - lapack_int* il, lapack_int* iu, double* abstol, - lapack_int* m, double* w, double* z, lapack_int* ldz, - double* work, lapack_int* lwork, lapack_int* iwork, - lapack_int* ifail, lapack_int *info ); -void LAPACK_cheevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, float* vl, - float* vu, lapack_int* il, lapack_int* iu, float* abstol, - lapack_int* m, float* w, lapack_complex_float* z, - lapack_int* ldz, lapack_complex_float* work, - lapack_int* lwork, float* rwork, lapack_int* iwork, - lapack_int* ifail, lapack_int *info ); -void LAPACK_zheevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, double* vl, - double* vu, lapack_int* il, lapack_int* iu, double* abstol, - lapack_int* m, double* w, lapack_complex_double* z, - lapack_int* ldz, lapack_complex_double* work, - lapack_int* lwork, double* rwork, lapack_int* iwork, - lapack_int* ifail, lapack_int *info ); -void LAPACK_ssyevr_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - float* a, lapack_int* lda, float* vl, float* vu, - lapack_int* il, lapack_int* iu, float* abstol, - lapack_int* m, float* w, float* z, lapack_int* ldz, - lapack_int* isuppz, float* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_dsyevr_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - double* a, lapack_int* lda, double* vl, double* vu, - lapack_int* il, lapack_int* iu, double* abstol, - lapack_int* m, double* w, double* z, lapack_int* ldz, - lapack_int* isuppz, double* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_cheevr_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, float* vl, - float* vu, lapack_int* il, lapack_int* iu, float* abstol, - lapack_int* m, float* w, lapack_complex_float* z, - lapack_int* ldz, lapack_int* isuppz, - lapack_complex_float* work, lapack_int* lwork, float* rwork, - lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, - lapack_int *info ); -void LAPACK_zheevr_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, double* vl, - double* vu, lapack_int* il, lapack_int* iu, double* abstol, - lapack_int* m, double* w, lapack_complex_double* z, - lapack_int* ldz, lapack_int* isuppz, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, lapack_int* lrwork, lapack_int* iwork, - lapack_int* liwork, lapack_int *info ); -void LAPACK_ssbev_2stage( char* jobz, char* uplo, 
lapack_int* n, lapack_int* kd, - float* ab, lapack_int* ldab, float* w, float* z, - lapack_int* ldz, float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dsbev_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd, - double* ab, lapack_int* ldab, double* w, double* z, - lapack_int* ldz, double* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_chbev_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd, - lapack_complex_float* ab, lapack_int* ldab, float* w, - lapack_complex_float* z, lapack_int* ldz, - lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int *info ); -void LAPACK_zhbev_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd, - lapack_complex_double* ab, lapack_int* ldab, double* w, - lapack_complex_double* z, lapack_int* ldz, - lapack_complex_double* work, lapack_int* lwork, double* rwork, - lapack_int *info ); -void LAPACK_ssbevd_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd, - float* ab, lapack_int* ldab, float* w, float* z, - lapack_int* ldz, float* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_dsbevd_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd, - double* ab, lapack_int* ldab, double* w, double* z, - lapack_int* ldz, double* work, lapack_int* lwork, - lapack_int* iwork, lapack_int* liwork, lapack_int *info ); -void LAPACK_chbevd_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd, - lapack_complex_float* ab, lapack_int* ldab, float* w, - lapack_complex_float* z, lapack_int* ldz, - lapack_complex_float* work, lapack_int* lwork, float* rwork, - lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, - lapack_int *info ); -void LAPACK_zhbevd_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd, - lapack_complex_double* ab, lapack_int* ldab, double* w, - lapack_complex_double* z, lapack_int* ldz, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, lapack_int* lrwork, lapack_int* iwork, - lapack_int* liwork, lapack_int *info ); -void LAPACK_ssbevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - lapack_int* kd, float* ab, lapack_int* ldab, float* q, - lapack_int* ldq, float* vl, float* vu, lapack_int* il, - lapack_int* iu, float* abstol, lapack_int* m, float* w, - float* z, lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* iwork, - lapack_int* ifail, lapack_int *info ); -void LAPACK_dsbevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - lapack_int* kd, double* ab, lapack_int* ldab, double* q, - lapack_int* ldq, double* vl, double* vu, lapack_int* il, - lapack_int* iu, double* abstol, lapack_int* m, double* w, - double* z, lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* iwork, - lapack_int* ifail, lapack_int *info ); -void LAPACK_chbevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab, - lapack_complex_float* q, lapack_int* ldq, float* vl, - float* vu, lapack_int* il, lapack_int* iu, float* abstol, - lapack_int* m, float* w, lapack_complex_float* z, - lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork, float* rwork, - lapack_int* iwork, lapack_int* ifail, lapack_int *info ); -void LAPACK_zhbevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n, - lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab, - lapack_complex_double* q, lapack_int* ldq, double* vl, - double* vu, lapack_int* il, lapack_int* iu, double* abstol, - lapack_int* m, double* w, 
lapack_complex_double* z, - lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork, double* rwork, - lapack_int* iwork, lapack_int* ifail, lapack_int *info ); -void LAPACK_ssygv_2stage( lapack_int* itype, char* jobz, char* uplo, lapack_int* n, - float* a, lapack_int* lda, float* b, lapack_int* ldb, - float* w, float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dsygv_2stage( lapack_int* itype, char* jobz, char* uplo, lapack_int* n, - double* a, lapack_int* lda, double* b, lapack_int* ldb, - double* w, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chegv_2stage( lapack_int* itype, char* jobz, char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, float* w, - lapack_complex_float* work, lapack_int* lwork, float* rwork, - lapack_int *info ); -void LAPACK_zhegv_2stage( lapack_int* itype, char* jobz, char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, double* w, - lapack_complex_double* work, lapack_int* lwork, - double* rwork, lapack_int *info ); - -//LAPACK 3.8.0 - -void LAPACK_ssysv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - float* a, lapack_int* lda, float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, float* b, lapack_int* ldb, - float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dsysv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, double* a, - lapack_int* lda, double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, double* b, - lapack_int* ldb, double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_csysv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zsysv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chesv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_float* b, lapack_int* ldb, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zhesv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_double* b, lapack_int* ldb, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); - -void LAPACK_ssytrf_aa_2stage( char* uplo, lapack_int* n, - float* a, lapack_int* lda, float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - float* work, lapack_int* lwork, lapack_int *info ); -void LAPACK_dsytrf_aa_2stage( char* uplo, lapack_int* n, double* a, - lapack_int* lda, double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_csytrf_aa_2stage( char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - 
lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zsytrf_aa_2stage( char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_chetrf_aa_2stage( char* uplo, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_float* work, lapack_int* lwork, - lapack_int *info ); -void LAPACK_zhetrf_aa_2stage( char* uplo, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_double* work, lapack_int* lwork, - lapack_int *info ); - -void LAPACK_ssytrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - float* a, lapack_int* lda, float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, float* b, lapack_int* ldb, - lapack_int *info ); -void LAPACK_dsytrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, double* a, - lapack_int* lda, double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, double* b, - lapack_int* ldb, lapack_int *info ); -void LAPACK_csytrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_float* b, lapack_int* ldb, - lapack_int *info ); -void LAPACK_zsytrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_double* b, lapack_int* ldb, - lapack_int *info ); -void LAPACK_chetrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_float* b, lapack_int* ldb, - lapack_int *info ); -void LAPACK_zhetrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* tb, lapack_int* ltb, - lapack_int* ipiv, lapack_int* ipiv2, - lapack_complex_double* b, lapack_int* ldb, - lapack_int *info ); /* APIs for set/get nancheck flags */ void LAPACKE_set_nancheck( int flag ); diff --git a/lapack-netlib/LAPACKE/src/CMakeLists.txt b/lapack-netlib/LAPACKE/src/CMakeLists.txt index 26e52acfa..4c13dce0b 100644 --- a/lapack-netlib/LAPACKE/src/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/src/CMakeLists.txt @@ -1,4 +1,4 @@ -set(SOURCES +set(SOURCES_COMPLEX lapacke_cbbcsd.c lapacke_cbbcsd_work.c lapacke_cbdsqr.c @@ -78,11 +78,11 @@ lapacke_cgeqrf_work.c lapacke_cgeqrfp.c lapacke_cgeqrfp_work.c lapacke_cgeqrt.c +lapacke_cgeqrt_work.c lapacke_cgeqrt2.c lapacke_cgeqrt2_work.c lapacke_cgeqrt3.c lapacke_cgeqrt3_work.c -lapacke_cgeqrt_work.c lapacke_cgerfs.c lapacke_cgerfs_work.c lapacke_cgerqf.c @@ -93,6 +93,8 @@ lapacke_cgesv.c lapacke_cgesv_work.c lapacke_cgesvd.c lapacke_cgesvd_work.c +lapacke_cgesvdq.c +lapacke_cgesvdq_work.c lapacke_cgesvdx.c lapacke_cgesvdx_work.c lapacke_cgesvj.c @@ -129,10 +131,10 @@ lapacke_cggevx.c lapacke_cggevx_work.c lapacke_cggglm.c lapacke_cggglm_work.c -lapacke_cgghrd.c -lapacke_cgghrd_work.c lapacke_cgghd3.c lapacke_cgghd3_work.c +lapacke_cgghrd.c +lapacke_cgghrd_work.c lapacke_cgglse.c lapacke_cgglse_work.c lapacke_cggqrf.c @@ -157,14 
+159,14 @@ lapacke_cgttrs.c lapacke_cgttrs_work.c lapacke_chbev.c lapacke_chbev_work.c -lapacke_chbevd.c -lapacke_chbevd_work.c -lapacke_chbevx.c -lapacke_chbevx_work.c lapacke_chbev_2stage.c lapacke_chbev_2stage_work.c +lapacke_chbevd.c +lapacke_chbevd_work.c lapacke_chbevd_2stage.c lapacke_chbevd_2stage_work.c +lapacke_chbevx.c +lapacke_chbevx_work.c lapacke_chbevx_2stage.c lapacke_chbevx_2stage_work.c lapacke_chbgst.c @@ -185,18 +187,18 @@ lapacke_cheequb.c lapacke_cheequb_work.c lapacke_cheev.c lapacke_cheev_work.c -lapacke_cheevd.c -lapacke_cheevd_work.c -lapacke_cheevr.c -lapacke_cheevr_work.c -lapacke_cheevx.c -lapacke_cheevx_work.c lapacke_cheev_2stage.c lapacke_cheev_2stage_work.c +lapacke_cheevd.c +lapacke_cheevd_work.c lapacke_cheevd_2stage.c lapacke_cheevd_2stage_work.c +lapacke_cheevr.c +lapacke_cheevr_work.c lapacke_cheevr_2stage.c lapacke_cheevr_2stage_work.c +lapacke_cheevx.c +lapacke_cheevx_work.c lapacke_cheevx_2stage.c lapacke_cheevx_2stage_work.c lapacke_chegst.c @@ -214,8 +216,8 @@ lapacke_cherfs_work.c lapacke_chesv.c lapacke_chesv_work.c lapacke_chesv_aa.c -lapacke_chesv_aa_2stage.c lapacke_chesv_aa_work.c +lapacke_chesv_aa_2stage.c lapacke_chesv_aa_2stage_work.c lapacke_chesv_rk.c lapacke_chesv_rk_work.c @@ -226,35 +228,35 @@ lapacke_cheswapr_work.c lapacke_chetrd.c lapacke_chetrd_work.c lapacke_chetrf.c -lapacke_chetrf_rook.c lapacke_chetrf_work.c -lapacke_chetrf_rook_work.c lapacke_chetrf_aa.c -lapacke_chetrf_aa_2stage.c lapacke_chetrf_aa_work.c +lapacke_chetrf_aa_2stage.c lapacke_chetrf_aa_2stage_work.c lapacke_chetrf_rk.c lapacke_chetrf_rk_work.c +lapacke_chetrf_rook.c +lapacke_chetrf_rook_work.c lapacke_chetri.c +lapacke_chetri_work.c lapacke_chetri2.c lapacke_chetri2_work.c -lapacke_chetri_3.c -lapacke_chetri_3_work.c lapacke_chetri2x.c lapacke_chetri2x_work.c -lapacke_chetri_work.c +lapacke_chetri_3.c +lapacke_chetri_3_work.c lapacke_chetrs.c -lapacke_chetrs_rook.c +lapacke_chetrs_work.c lapacke_chetrs2.c lapacke_chetrs2_work.c -lapacke_chetrs_work.c -lapacke_chetrs_rook_work.c +lapacke_chetrs_3.c +lapacke_chetrs_3_work.c lapacke_chetrs_aa.c -lapacke_chetrs_aa_2stage.c lapacke_chetrs_aa_work.c +lapacke_chetrs_aa_2stage.c lapacke_chetrs_aa_2stage_work.c -lapacke_chetrs_3.c -lapacke_chetrs_3_work.c +lapacke_chetrs_rook.c +lapacke_chetrs_rook_work.c lapacke_chfrk.c lapacke_chfrk_work.c lapacke_chgeqz.c @@ -445,52 +447,54 @@ lapacke_csyconv.c lapacke_csyconv_work.c lapacke_csyequb.c lapacke_csyequb_work.c +lapacke_csyr.c +lapacke_csyr_work.c lapacke_csyrfs.c lapacke_csyrfs_work.c lapacke_csysv.c -lapacke_csysv_rook.c -lapacke_csysv_rook_work.c lapacke_csysv_work.c lapacke_csysv_aa.c -lapacke_csysv_aa_2stage.c lapacke_csysv_aa_work.c +lapacke_csysv_aa_2stage.c lapacke_csysv_aa_2stage_work.c lapacke_csysv_rk.c lapacke_csysv_rk_work.c +lapacke_csysv_rook.c +lapacke_csysv_rook_work.c lapacke_csysvx.c lapacke_csysvx_work.c lapacke_csyswapr.c lapacke_csyswapr_work.c lapacke_csytrf.c lapacke_csytrf_work.c -lapacke_csytrf_rook.c -lapacke_csytrf_rook_work.c lapacke_csytrf_aa.c -lapacke_csytrf_aa_2stage.c lapacke_csytrf_aa_work.c +lapacke_csytrf_aa_2stage.c lapacke_csytrf_aa_2stage_work.c lapacke_csytrf_rk.c lapacke_csytrf_rk_work.c +lapacke_csytrf_rook.c +lapacke_csytrf_rook_work.c lapacke_csytri.c +lapacke_csytri_work.c lapacke_csytri2.c lapacke_csytri2_work.c -lapacke_csytri_3.c -lapacke_csytri_3_work.c lapacke_csytri2x.c lapacke_csytri2x_work.c -lapacke_csytri_work.c +lapacke_csytri_3.c +lapacke_csytri_3_work.c lapacke_csytrs.c -lapacke_csytrs_rook.c 
+lapacke_csytrs_work.c lapacke_csytrs2.c lapacke_csytrs2_work.c -lapacke_csytrs_work.c -lapacke_csytrs_rook_work.c +lapacke_csytrs_3.c +lapacke_csytrs_3_work.c lapacke_csytrs_aa.c -lapacke_csytrs_aa_2stage.c lapacke_csytrs_aa_work.c +lapacke_csytrs_aa_2stage.c lapacke_csytrs_aa_2stage_work.c -lapacke_csytrs_3.c -lapacke_csytrs_3_work.c +lapacke_csytrs_rook.c +lapacke_csytrs_rook_work.c lapacke_ctbcon.c lapacke_ctbcon_work.c lapacke_ctbrfs.c @@ -522,9 +526,9 @@ lapacke_ctpcon_work.c lapacke_ctpmqrt.c lapacke_ctpmqrt_work.c lapacke_ctpqrt.c +lapacke_ctpqrt_work.c lapacke_ctpqrt2.c lapacke_ctpqrt2_work.c -lapacke_ctpqrt_work.c lapacke_ctprfb.c lapacke_ctprfb_work.c lapacke_ctprfs.c @@ -601,14 +605,16 @@ lapacke_cupgtr.c lapacke_cupgtr_work.c lapacke_cupmtr.c lapacke_cupmtr_work.c +) +set(SOURCES_DOUBLE lapacke_dbbcsd.c lapacke_dbbcsd_work.c lapacke_dbdsdc.c lapacke_dbdsdc_work.c -lapacke_dbdsvdx.c -lapacke_dbdsvdx_work.c lapacke_dbdsqr.c lapacke_dbdsqr_work.c +lapacke_dbdsvdx.c +lapacke_dbdsvdx_work.c lapacke_ddisna.c lapacke_ddisna_work.c lapacke_dgbbrd.c @@ -686,11 +692,11 @@ lapacke_dgeqrf_work.c lapacke_dgeqrfp.c lapacke_dgeqrfp_work.c lapacke_dgeqrt.c +lapacke_dgeqrt_work.c lapacke_dgeqrt2.c lapacke_dgeqrt2_work.c lapacke_dgeqrt3.c lapacke_dgeqrt3_work.c -lapacke_dgeqrt_work.c lapacke_dgerfs.c lapacke_dgerfs_work.c lapacke_dgerqf.c @@ -701,6 +707,8 @@ lapacke_dgesv.c lapacke_dgesv_work.c lapacke_dgesvd.c lapacke_dgesvd_work.c +lapacke_dgesvdq.c +lapacke_dgesvdq_work.c lapacke_dgesvdx.c lapacke_dgesvdx_work.c lapacke_dgesvj.c @@ -737,10 +745,10 @@ lapacke_dggevx.c lapacke_dggevx_work.c lapacke_dggglm.c lapacke_dggglm_work.c -lapacke_dgghrd.c -lapacke_dgghrd_work.c lapacke_dgghd3.c lapacke_dgghd3_work.c +lapacke_dgghrd.c +lapacke_dgghrd_work.c lapacke_dgglse.c lapacke_dgglse_work.c lapacke_dggqrf.c @@ -823,10 +831,10 @@ lapacke_dopmtr.c lapacke_dopmtr_work.c lapacke_dorbdb.c lapacke_dorbdb_work.c -lapacke_dorcsd2by1.c -lapacke_dorcsd2by1_work.c lapacke_dorcsd.c lapacke_dorcsd_work.c +lapacke_dorcsd2by1.c +lapacke_dorcsd2by1_work.c lapacke_dorgbr.c lapacke_dorgbr_work.c lapacke_dorghr.c @@ -933,14 +941,14 @@ lapacke_dpttrs.c lapacke_dpttrs_work.c lapacke_dsbev.c lapacke_dsbev_work.c -lapacke_dsbevd.c -lapacke_dsbevd_work.c -lapacke_dsbevx.c -lapacke_dsbevx_work.c lapacke_dsbev_2stage.c lapacke_dsbev_2stage_work.c +lapacke_dsbevd.c +lapacke_dsbevd_work.c lapacke_dsbevd_2stage.c lapacke_dsbevd_2stage_work.c +lapacke_dsbevx.c +lapacke_dsbevx_work.c lapacke_dsbevx_2stage.c lapacke_dsbevx_2stage_work.c lapacke_dsbgst.c @@ -1021,18 +1029,18 @@ lapacke_dsyequb.c lapacke_dsyequb_work.c lapacke_dsyev.c lapacke_dsyev_work.c -lapacke_dsyevd.c -lapacke_dsyevd_work.c -lapacke_dsyevr.c -lapacke_dsyevr_work.c -lapacke_dsyevx.c -lapacke_dsyevx_work.c lapacke_dsyev_2stage.c lapacke_dsyev_2stage_work.c +lapacke_dsyevd.c +lapacke_dsyevd_work.c lapacke_dsyevd_2stage.c lapacke_dsyevd_2stage_work.c +lapacke_dsyevr.c +lapacke_dsyevr_work.c lapacke_dsyevr_2stage.c lapacke_dsyevr_2stage_work.c +lapacke_dsyevx.c +lapacke_dsyevx_work.c lapacke_dsyevx_2stage.c lapacke_dsyevx_2stage_work.c lapacke_dsygst.c @@ -1048,15 +1056,15 @@ lapacke_dsygvx_work.c lapacke_dsyrfs.c lapacke_dsyrfs_work.c lapacke_dsysv.c -lapacke_dsysv_rook.c -lapacke_dsysv_rook_work.c lapacke_dsysv_work.c lapacke_dsysv_aa.c -lapacke_dsysv_aa_2stage.c lapacke_dsysv_aa_work.c +lapacke_dsysv_aa_2stage.c lapacke_dsysv_aa_2stage_work.c lapacke_dsysv_rk.c lapacke_dsysv_rk_work.c +lapacke_dsysv_rook.c +lapacke_dsysv_rook_work.c lapacke_dsysvx.c 
lapacke_dsysvx_work.c lapacke_dsyswapr.c @@ -1065,33 +1073,33 @@ lapacke_dsytrd.c lapacke_dsytrd_work.c lapacke_dsytrf.c lapacke_dsytrf_work.c -lapacke_dsytrf_rook.c -lapacke_dsytrf_rook_work.c lapacke_dsytrf_aa.c -lapacke_dsytrf_aa_2stage.c lapacke_dsytrf_aa_work.c +lapacke_dsytrf_aa_2stage.c lapacke_dsytrf_aa_2stage_work.c lapacke_dsytrf_rk.c lapacke_dsytrf_rk_work.c +lapacke_dsytrf_rook.c +lapacke_dsytrf_rook_work.c lapacke_dsytri.c +lapacke_dsytri_work.c lapacke_dsytri2.c lapacke_dsytri2_work.c -lapacke_dsytri_3.c -lapacke_dsytri_3_work.c lapacke_dsytri2x.c lapacke_dsytri2x_work.c -lapacke_dsytri_work.c +lapacke_dsytri_3.c +lapacke_dsytri_3_work.c lapacke_dsytrs.c -lapacke_dsytrs_rook.c +lapacke_dsytrs_work.c lapacke_dsytrs2.c lapacke_dsytrs2_work.c +lapacke_dsytrs_3.c +lapacke_dsytrs_3_work.c lapacke_dsytrs_aa.c -lapacke_dsytrs_aa_2stage.c lapacke_dsytrs_aa_work.c +lapacke_dsytrs_aa_2stage.c lapacke_dsytrs_aa_2stage_work.c -lapacke_dsytrs_3.c -lapacke_dsytrs_3_work.c -lapacke_dsytrs_work.c +lapacke_dsytrs_rook.c lapacke_dsytrs_rook_work.c lapacke_dtbcon.c lapacke_dtbcon_work.c @@ -1124,9 +1132,9 @@ lapacke_dtpcon_work.c lapacke_dtpmqrt.c lapacke_dtpmqrt_work.c lapacke_dtpqrt.c +lapacke_dtpqrt_work.c lapacke_dtpqrt2.c lapacke_dtpqrt2_work.c -lapacke_dtpqrt_work.c lapacke_dtprfb.c lapacke_dtprfb_work.c lapacke_dtprfs.c @@ -1163,15 +1171,21 @@ lapacke_dtrttp.c lapacke_dtrttp_work.c lapacke_dtzrzf.c lapacke_dtzrzf_work.c +) + +set(SOURCES lapacke_nancheck.c +lapacke_ilaver.c +) +set(SOURCES_SINGLE lapacke_sbbcsd.c lapacke_sbbcsd_work.c lapacke_sbdsdc.c lapacke_sbdsdc_work.c -lapacke_sbdsvdx.c -lapacke_sbdsvdx_work.c lapacke_sbdsqr.c lapacke_sbdsqr_work.c +lapacke_sbdsvdx.c +lapacke_sbdsvdx_work.c lapacke_sdisna.c lapacke_sdisna_work.c lapacke_sgbbrd.c @@ -1249,11 +1263,11 @@ lapacke_sgeqrf_work.c lapacke_sgeqrfp.c lapacke_sgeqrfp_work.c lapacke_sgeqrt.c +lapacke_sgeqrt_work.c lapacke_sgeqrt2.c lapacke_sgeqrt2_work.c lapacke_sgeqrt3.c lapacke_sgeqrt3_work.c -lapacke_sgeqrt_work.c lapacke_sgerfs.c lapacke_sgerfs_work.c lapacke_sgerqf.c @@ -1264,6 +1278,8 @@ lapacke_sgesv.c lapacke_sgesv_work.c lapacke_sgesvd.c lapacke_sgesvd_work.c +lapacke_sgesvdq.c +lapacke_sgesvdq_work.c lapacke_sgesvdx.c lapacke_sgesvdx_work.c lapacke_sgesvj.c @@ -1300,10 +1316,10 @@ lapacke_sggevx.c lapacke_sggevx_work.c lapacke_sggglm.c lapacke_sggglm_work.c -lapacke_sgghrd.c -lapacke_sgghrd_work.c lapacke_sgghd3.c lapacke_sgghd3_work.c +lapacke_sgghrd.c +lapacke_sgghrd_work.c lapacke_sgglse.c lapacke_sgglse_work.c lapacke_sggqrf.c @@ -1496,14 +1512,14 @@ lapacke_spttrs.c lapacke_spttrs_work.c lapacke_ssbev.c lapacke_ssbev_work.c -lapacke_ssbevd.c -lapacke_ssbevd_work.c -lapacke_ssbevx.c -lapacke_ssbevx_work.c lapacke_ssbev_2stage.c lapacke_ssbev_2stage_work.c +lapacke_ssbevd.c +lapacke_ssbevd_work.c lapacke_ssbevd_2stage.c lapacke_ssbevd_2stage_work.c +lapacke_ssbevx.c +lapacke_ssbevx_work.c lapacke_ssbevx_2stage.c lapacke_ssbevx_2stage_work.c lapacke_ssbgst.c @@ -1580,18 +1596,18 @@ lapacke_ssyequb.c lapacke_ssyequb_work.c lapacke_ssyev.c lapacke_ssyev_work.c -lapacke_ssyevd.c -lapacke_ssyevd_work.c -lapacke_ssyevr.c -lapacke_ssyevr_work.c -lapacke_ssyevx.c -lapacke_ssyevx_work.c lapacke_ssyev_2stage.c lapacke_ssyev_2stage_work.c +lapacke_ssyevd.c +lapacke_ssyevd_work.c lapacke_ssyevd_2stage.c lapacke_ssyevd_2stage_work.c +lapacke_ssyevr.c +lapacke_ssyevr_work.c lapacke_ssyevr_2stage.c lapacke_ssyevr_2stage_work.c +lapacke_ssyevx.c +lapacke_ssyevx_work.c lapacke_ssyevx_2stage.c lapacke_ssyevx_2stage_work.c 
lapacke_ssygst.c @@ -1607,8 +1623,6 @@ lapacke_ssygvx_work.c lapacke_ssyrfs.c lapacke_ssyrfs_work.c lapacke_ssysv.c -lapacke_ssysv_rook.c -lapacke_ssysv_rook_work.c lapacke_ssysv_work.c lapacke_ssysv_aa.c lapacke_ssysv_aa_work.c @@ -1616,6 +1630,8 @@ lapacke_ssysv_aa_2stage.c lapacke_ssysv_aa_2stage_work.c lapacke_ssysv_rk.c lapacke_ssysv_rk_work.c +lapacke_ssysv_rook.c +lapacke_ssysv_rook_work.c lapacke_ssysvx.c lapacke_ssysvx_work.c lapacke_ssyswapr.c @@ -1624,33 +1640,33 @@ lapacke_ssytrd.c lapacke_ssytrd_work.c lapacke_ssytrf.c lapacke_ssytrf_work.c -lapacke_ssytrf_rook.c -lapacke_ssytrf_rook_work.c lapacke_ssytrf_aa.c -lapacke_ssytrf_aa_2stage.c lapacke_ssytrf_aa_work.c +lapacke_ssytrf_aa_2stage.c lapacke_ssytrf_aa_2stage_work.c lapacke_ssytrf_rk.c lapacke_ssytrf_rk_work.c +lapacke_ssytrf_rook.c +lapacke_ssytrf_rook_work.c lapacke_ssytri.c +lapacke_ssytri_work.c lapacke_ssytri2.c lapacke_ssytri2_work.c -lapacke_ssytri_3.c -lapacke_ssytri_3_work.c lapacke_ssytri2x.c lapacke_ssytri2x_work.c -lapacke_ssytri_work.c +lapacke_ssytri_3.c +lapacke_ssytri_3_work.c lapacke_ssytrs.c -lapacke_ssytrs_rook.c +lapacke_ssytrs_work.c lapacke_ssytrs2.c lapacke_ssytrs2_work.c +lapacke_ssytrs_3.c +lapacke_ssytrs_3_work.c lapacke_ssytrs_aa.c -lapacke_ssytrs_aa_2stage.c lapacke_ssytrs_aa_work.c +lapacke_ssytrs_aa_2stage.c lapacke_ssytrs_aa_2stage_work.c -lapacke_ssytrs_3.c -lapacke_ssytrs_3_work.c -lapacke_ssytrs_work.c +lapacke_ssytrs_rook.c lapacke_ssytrs_rook_work.c lapacke_stbcon.c lapacke_stbcon_work.c @@ -1722,6 +1738,8 @@ lapacke_strttp.c lapacke_strttp_work.c lapacke_stzrzf.c lapacke_stzrzf_work.c +) +set(SOURCES_COMPLEX16 lapacke_zbbcsd.c lapacke_zbbcsd_work.c lapacke_zbdsqr.c @@ -1805,11 +1823,11 @@ lapacke_zgeqrf_work.c lapacke_zgeqrfp.c lapacke_zgeqrfp_work.c lapacke_zgeqrt.c +lapacke_zgeqrt_work.c lapacke_zgeqrt2.c lapacke_zgeqrt2_work.c lapacke_zgeqrt3.c lapacke_zgeqrt3_work.c -lapacke_zgeqrt_work.c lapacke_zgerfs.c lapacke_zgerfs_work.c lapacke_zgerqf.c @@ -1820,6 +1838,8 @@ lapacke_zgesv.c lapacke_zgesv_work.c lapacke_zgesvd.c lapacke_zgesvd_work.c +lapacke_zgesvdq.c +lapacke_zgesvdq_work.c lapacke_zgesvdx.c lapacke_zgesvdx_work.c lapacke_zgesvj.c @@ -1856,10 +1876,10 @@ lapacke_zggevx.c lapacke_zggevx_work.c lapacke_zggglm.c lapacke_zggglm_work.c -lapacke_zgghrd.c -lapacke_zgghrd_work.c lapacke_zgghd3.c lapacke_zgghd3_work.c +lapacke_zgghrd.c +lapacke_zgghrd_work.c lapacke_zgglse.c lapacke_zgglse_work.c lapacke_zggqrf.c @@ -1884,14 +1904,14 @@ lapacke_zgttrs.c lapacke_zgttrs_work.c lapacke_zhbev.c lapacke_zhbev_work.c -lapacke_zhbevd.c -lapacke_zhbevd_work.c -lapacke_zhbevx.c -lapacke_zhbevx_work.c lapacke_zhbev_2stage.c lapacke_zhbev_2stage_work.c +lapacke_zhbevd.c +lapacke_zhbevd_work.c lapacke_zhbevd_2stage.c lapacke_zhbevd_2stage_work.c +lapacke_zhbevx.c +lapacke_zhbevx_work.c lapacke_zhbevx_2stage.c lapacke_zhbevx_2stage_work.c lapacke_zhbgst.c @@ -1912,18 +1932,18 @@ lapacke_zheequb.c lapacke_zheequb_work.c lapacke_zheev.c lapacke_zheev_work.c -lapacke_zheevd.c -lapacke_zheevd_work.c -lapacke_zheevr.c -lapacke_zheevr_work.c -lapacke_zheevx.c -lapacke_zheevx_work.c lapacke_zheev_2stage.c lapacke_zheev_2stage_work.c +lapacke_zheevd.c +lapacke_zheevd_work.c lapacke_zheevd_2stage.c lapacke_zheevd_2stage_work.c +lapacke_zheevr.c +lapacke_zheevr_work.c lapacke_zheevr_2stage.c lapacke_zheevr_2stage_work.c +lapacke_zheevx.c +lapacke_zheevx_work.c lapacke_zheevx_2stage.c lapacke_zheevx_2stage_work.c lapacke_zhegst.c @@ -1941,8 +1961,8 @@ lapacke_zherfs_work.c lapacke_zhesv.c 
lapacke_zhesv_work.c lapacke_zhesv_aa.c -lapacke_zhesv_aa_2stage.c lapacke_zhesv_aa_work.c +lapacke_zhesv_aa_2stage.c lapacke_zhesv_aa_2stage_work.c lapacke_zhesv_rk.c lapacke_zhesv_rk_work.c @@ -1953,34 +1973,34 @@ lapacke_zheswapr_work.c lapacke_zhetrd.c lapacke_zhetrd_work.c lapacke_zhetrf.c -lapacke_zhetrf_rook.c lapacke_zhetrf_work.c -lapacke_zhetrf_rook_work.c lapacke_zhetrf_aa.c -lapacke_zhetrf_aa_2stage.c lapacke_zhetrf_aa_work.c +lapacke_zhetrf_aa_2stage.c lapacke_zhetrf_aa_2stage_work.c lapacke_zhetrf_rk.c lapacke_zhetrf_rk_work.c +lapacke_zhetrf_rook.c +lapacke_zhetrf_rook_work.c lapacke_zhetri.c +lapacke_zhetri_work.c lapacke_zhetri2.c lapacke_zhetri2_work.c -lapacke_zhetri_3.c -lapacke_zhetri_3_work.c lapacke_zhetri2x.c lapacke_zhetri2x_work.c -lapacke_zhetri_work.c +lapacke_zhetri_3.c +lapacke_zhetri_3_work.c lapacke_zhetrs.c -lapacke_zhetrs_rook.c +lapacke_zhetrs_work.c lapacke_zhetrs2.c lapacke_zhetrs2_work.c -lapacke_zhetrs_work.c +lapacke_zhetrs_3.c +lapacke_zhetrs_3_work.c lapacke_zhetrs_aa.c -lapacke_zhetrs_aa_2stage.c lapacke_zhetrs_aa_work.c +lapacke_zhetrs_aa_2stage.c lapacke_zhetrs_aa_2stage_work.c -lapacke_zhetrs_3.c -lapacke_zhetrs_3_work.c +lapacke_zhetrs_rook.c lapacke_zhetrs_rook_work.c lapacke_zhfrk.c lapacke_zhfrk_work.c @@ -2172,52 +2192,54 @@ lapacke_zsyconv.c lapacke_zsyconv_work.c lapacke_zsyequb.c lapacke_zsyequb_work.c +lapacke_zsyr.c +lapacke_zsyr_work.c lapacke_zsyrfs.c lapacke_zsyrfs_work.c lapacke_zsysv.c -lapacke_zsysv_rook.c -lapacke_zsysv_rook_work.c lapacke_zsysv_work.c lapacke_zsysv_aa.c -lapacke_zsysv_aa_2stage.c lapacke_zsysv_aa_work.c +lapacke_zsysv_aa_2stage.c lapacke_zsysv_aa_2stage_work.c lapacke_zsysv_rk.c lapacke_zsysv_rk_work.c +lapacke_zsysv_rook.c +lapacke_zsysv_rook_work.c lapacke_zsysvx.c lapacke_zsysvx_work.c lapacke_zsyswapr.c lapacke_zsyswapr_work.c lapacke_zsytrf.c lapacke_zsytrf_work.c -lapacke_zsytrf_rook.c -lapacke_zsytrf_rook_work.c lapacke_zsytrf_aa.c -lapacke_zsytrf_aa_2stage.c lapacke_zsytrf_aa_work.c +lapacke_zsytrf_aa_2stage.c lapacke_zsytrf_aa_2stage_work.c lapacke_zsytrf_rk.c lapacke_zsytrf_rk_work.c +lapacke_zsytrf_rook.c +lapacke_zsytrf_rook_work.c lapacke_zsytri.c +lapacke_zsytri_work.c lapacke_zsytri2.c lapacke_zsytri2_work.c -lapacke_zsytri_3.c -lapacke_zsytri_3_work.c lapacke_zsytri2x.c lapacke_zsytri2x_work.c -lapacke_zsytri_work.c +lapacke_zsytri_3.c +lapacke_zsytri_3_work.c lapacke_zsytrs.c -lapacke_zsytrs_rook.c +lapacke_zsytrs_work.c lapacke_zsytrs2.c lapacke_zsytrs2_work.c -lapacke_zsytrs_work.c -lapacke_zsytrs_rook_work.c +lapacke_zsytrs_3.c +lapacke_zsytrs_3_work.c lapacke_zsytrs_aa.c -lapacke_zsytrs_aa_2stage.c lapacke_zsytrs_aa_work.c +lapacke_zsytrs_aa_2stage.c lapacke_zsytrs_aa_2stage_work.c -lapacke_zsytrs_3.c -lapacke_zsytrs_3_work.c +lapacke_zsytrs_rook.c +lapacke_zsytrs_rook_work.c lapacke_ztbcon.c lapacke_ztbcon_work.c lapacke_ztbrfs.c @@ -2249,9 +2271,9 @@ lapacke_ztpcon_work.c lapacke_ztpmqrt.c lapacke_ztpmqrt_work.c lapacke_ztpqrt.c +lapacke_ztpqrt_work.c lapacke_ztpqrt2.c lapacke_ztpqrt2_work.c -lapacke_ztpqrt_work.c lapacke_ztprfb.c lapacke_ztprfb_work.c lapacke_ztprfs.c @@ -2328,11 +2350,6 @@ lapacke_zupgtr.c lapacke_zupgtr_work.c lapacke_zupmtr.c lapacke_zupmtr_work.c -lapacke_zsyr.c -lapacke_csyr.c -lapacke_zsyr_work.c -lapacke_csyr_work.c -lapacke_ilaver.c ) set(DEPRECATED diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 7672f9f73..8060151ae 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -32,12 
+32,21 @@ ############################################################################## # makefile for LAPACKE, used to build lapacke binary. # -# Note: we use multiple OBJ_A, OBJ_B, etc, instead of a single OBJ +# Note: we use multiple OBJ_S, OBJ_C, etc, instead of a single OBJ # to allow build with mingw (argument list too long for the msys ar) # -include ../../make.inc +TOPSRCDIR = ../.. +include $(TOPSRCDIR)/make.inc -OBJ_A = \ +.SUFFIXES: .c .o +.c.o: + $(CC) $(CFLAGS) -I../include -c -o $@ $< + +OBJ = \ +lapacke_ilaver.o \ +lapacke_nancheck.o + +OBJ_C = \ lapacke_cbbcsd.o \ lapacke_cbbcsd_work.o \ lapacke_cbdsqr.o \ @@ -82,12 +91,12 @@ lapacke_cgeevx.o \ lapacke_cgeevx_work.o \ lapacke_cgehrd.o \ lapacke_cgehrd_work.o \ +lapacke_cgejsv.o \ +lapacke_cgejsv_work.o \ lapacke_cgelq.o \ lapacke_cgelq_work.o \ lapacke_cgelq2.o \ lapacke_cgelq2_work.o \ -lapacke_cgejsv.o \ -lapacke_cgejsv_work.o \ lapacke_cgelqf.o \ lapacke_cgelqf_work.o \ lapacke_cgels.o \ @@ -117,11 +126,11 @@ lapacke_cgeqrf_work.o \ lapacke_cgeqrfp.o \ lapacke_cgeqrfp_work.o \ lapacke_cgeqrt.o \ +lapacke_cgeqrt_work.o \ lapacke_cgeqrt2.o \ lapacke_cgeqrt2_work.o \ lapacke_cgeqrt3.o \ lapacke_cgeqrt3_work.o \ -lapacke_cgeqrt_work.o \ lapacke_cgerfs.o \ lapacke_cgerfs_work.o \ lapacke_cgerqf.o \ @@ -132,6 +141,8 @@ lapacke_cgesv.o \ lapacke_cgesv_work.o \ lapacke_cgesvd.o \ lapacke_cgesvd_work.o \ +lapacke_cgesvdq.o \ +lapacke_cgesvdq_work.o \ lapacke_cgesvdx.o \ lapacke_cgesvdx_work.o \ lapacke_cgesvj.o \ @@ -168,10 +179,10 @@ lapacke_cggevx.o \ lapacke_cggevx_work.o \ lapacke_cggglm.o \ lapacke_cggglm_work.o \ -lapacke_cgghrd.o \ -lapacke_cgghrd_work.o \ lapacke_cgghd3.o \ lapacke_cgghd3_work.o \ +lapacke_cgghrd.o \ +lapacke_cgghrd_work.o \ lapacke_cgglse.o \ lapacke_cgglse_work.o \ lapacke_cggqrf.o \ @@ -196,14 +207,14 @@ lapacke_cgttrs.o \ lapacke_cgttrs_work.o \ lapacke_chbev.o \ lapacke_chbev_work.o \ -lapacke_chbevd.o \ -lapacke_chbevd_work.o \ -lapacke_chbevx.o \ -lapacke_chbevx_work.o \ lapacke_chbev_2stage.o \ lapacke_chbev_2stage_work.o \ +lapacke_chbevd.o \ +lapacke_chbevd_work.o \ lapacke_chbevd_2stage.o \ lapacke_chbevd_2stage_work.o \ +lapacke_chbevx.o \ +lapacke_chbevx_work.o \ lapacke_chbevx_2stage.o \ lapacke_chbevx_2stage_work.o \ lapacke_chbgst.o \ @@ -224,18 +235,18 @@ lapacke_cheequb.o \ lapacke_cheequb_work.o \ lapacke_cheev.o \ lapacke_cheev_work.o \ -lapacke_cheevd.o \ -lapacke_cheevd_work.o \ -lapacke_cheevr.o \ -lapacke_cheevr_work.o \ -lapacke_cheevx.o \ -lapacke_cheevx_work.o \ lapacke_cheev_2stage.o \ lapacke_cheev_2stage_work.o \ +lapacke_cheevd.o \ +lapacke_cheevd_work.o \ lapacke_cheevd_2stage.o \ lapacke_cheevd_2stage_work.o \ +lapacke_cheevr.o \ +lapacke_cheevr_work.o \ lapacke_cheevr_2stage.o \ lapacke_cheevr_2stage_work.o \ +lapacke_cheevx.o \ +lapacke_cheevx_work.o \ lapacke_cheevx_2stage.o \ lapacke_cheevx_2stage_work.o \ lapacke_chegst.o \ @@ -265,35 +276,35 @@ lapacke_cheswapr_work.o \ lapacke_chetrd.o \ lapacke_chetrd_work.o \ lapacke_chetrf.o \ -lapacke_chetrf_rook.o \ lapacke_chetrf_work.o \ -lapacke_chetrf_rook_work.o \ lapacke_chetrf_aa.o \ -lapacke_chetrf_aa_2stage.o \ lapacke_chetrf_aa_work.o \ +lapacke_chetrf_aa_2stage.o \ lapacke_chetrf_aa_2stage_work.o \ lapacke_chetrf_rk.o \ lapacke_chetrf_rk_work.o \ +lapacke_chetrf_rook.o \ +lapacke_chetrf_rook_work.o \ lapacke_chetri.o \ +lapacke_chetri_work.o \ lapacke_chetri2.o \ lapacke_chetri2_work.o \ -lapacke_chetri_3.o \ -lapacke_chetri_3_work.o \ lapacke_chetri2x.o \ lapacke_chetri2x_work.o \ -lapacke_chetri_work.o \ 
+lapacke_chetri_3.o \ +lapacke_chetri_3_work.o \ lapacke_chetrs.o \ -lapacke_chetrs_rook.o \ +lapacke_chetrs_work.o \ lapacke_chetrs2.o \ lapacke_chetrs2_work.o \ -lapacke_chetrs_work.o \ -lapacke_chetrs_rook_work.o \ +lapacke_chetrs_3.o \ +lapacke_chetrs_3_work.o \ lapacke_chetrs_aa.o \ -lapacke_chetrs_aa_2stage.o \ lapacke_chetrs_aa_work.o \ +lapacke_chetrs_aa_2stage.o \ lapacke_chetrs_aa_2stage_work.o \ -lapacke_chetrs_3.o \ -lapacke_chetrs_3_work.o \ +lapacke_chetrs_rook.o \ +lapacke_chetrs_rook_work.o \ lapacke_chfrk.o \ lapacke_chfrk_work.o \ lapacke_chgeqz.o \ @@ -484,11 +495,11 @@ lapacke_csyconv.o \ lapacke_csyconv_work.o \ lapacke_csyequb.o \ lapacke_csyequb_work.o \ +lapacke_csyr.o \ +lapacke_csyr_work.o \ lapacke_csyrfs.o \ lapacke_csyrfs_work.o \ lapacke_csysv.o \ -lapacke_csysv_rook.o \ -lapacke_csysv_rook_work.o \ lapacke_csysv_work.o \ lapacke_csysv_aa.o \ lapacke_csysv_aa_work.o \ @@ -496,40 +507,42 @@ lapacke_csysv_aa_2stage.o \ lapacke_csysv_aa_2stage_work.o \ lapacke_csysv_rk.o \ lapacke_csysv_rk_work.o \ +lapacke_csysv_rook.o \ +lapacke_csysv_rook_work.o \ lapacke_csysvx.o \ lapacke_csysvx_work.o \ lapacke_csyswapr.o \ lapacke_csyswapr_work.o \ lapacke_csytrf.o \ lapacke_csytrf_work.o \ -lapacke_csytrf_rook.o \ -lapacke_csytrf_rook_work.o \ lapacke_csytrf_aa.o \ -lapacke_csytrf_aa_2stage.o \ lapacke_csytrf_aa_work.o \ +lapacke_csytrf_aa_2stage.o \ lapacke_csytrf_aa_2stage_work.o \ lapacke_csytrf_rk.o \ lapacke_csytrf_rk_work.o \ +lapacke_csytrf_rook.o \ +lapacke_csytrf_rook_work.o \ lapacke_csytri.o \ +lapacke_csytri_work.o \ lapacke_csytri2.o \ lapacke_csytri2_work.o \ -lapacke_csytri_3.o \ -lapacke_csytri_3_work.o \ lapacke_csytri2x.o \ lapacke_csytri2x_work.o \ -lapacke_csytri_work.o \ +lapacke_csytri_3.o \ +lapacke_csytri_3_work.o \ lapacke_csytrs.o \ -lapacke_csytrs_rook.o \ +lapacke_csytrs_work.o \ lapacke_csytrs2.o \ lapacke_csytrs2_work.o \ -lapacke_csytrs_work.o \ -lapacke_csytrs_rook_work.o \ +lapacke_csytrs_3.o \ +lapacke_csytrs_3_work.o \ lapacke_csytrs_aa.o \ -lapacke_csytrs_aa_2stage.o \ lapacke_csytrs_aa_work.o \ +lapacke_csytrs_aa_2stage.o \ lapacke_csytrs_aa_2stage_work.o \ -lapacke_csytrs_3.o \ -lapacke_csytrs_3_work.o \ +lapacke_csytrs_rook.o \ +lapacke_csytrs_rook_work.o \ lapacke_ctbcon.o \ lapacke_ctbcon_work.o \ lapacke_ctbrfs.o \ @@ -561,9 +574,9 @@ lapacke_ctpcon_work.o \ lapacke_ctpmqrt.o \ lapacke_ctpmqrt_work.o \ lapacke_ctpqrt.o \ +lapacke_ctpqrt_work.o \ lapacke_ctpqrt2.o \ lapacke_ctpqrt2_work.o \ -lapacke_ctpqrt_work.o \ lapacke_ctprfb.o \ lapacke_ctprfb_work.o \ lapacke_ctprfs.o \ @@ -639,15 +652,17 @@ lapacke_cunmtr_work.o \ lapacke_cupgtr.o \ lapacke_cupgtr_work.o \ lapacke_cupmtr.o \ -lapacke_cupmtr_work.o \ +lapacke_cupmtr_work.o + +OBJ_D = \ lapacke_dbbcsd.o \ lapacke_dbbcsd_work.o \ lapacke_dbdsdc.o \ lapacke_dbdsdc_work.o \ -lapacke_dbdsvdx.o \ -lapacke_dbdsvdx_work.o \ lapacke_dbdsqr.o \ lapacke_dbdsqr_work.o \ +lapacke_dbdsvdx.o \ +lapacke_dbdsvdx_work.o \ lapacke_ddisna.o \ lapacke_ddisna_work.o \ lapacke_dgbbrd.o \ @@ -725,11 +740,11 @@ lapacke_dgeqrf_work.o \ lapacke_dgeqrfp.o \ lapacke_dgeqrfp_work.o \ lapacke_dgeqrt.o \ +lapacke_dgeqrt_work.o \ lapacke_dgeqrt2.o \ lapacke_dgeqrt2_work.o \ lapacke_dgeqrt3.o \ lapacke_dgeqrt3_work.o \ -lapacke_dgeqrt_work.o \ lapacke_dgerfs.o \ lapacke_dgerfs_work.o \ lapacke_dgerqf.o \ @@ -740,6 +755,8 @@ lapacke_dgesv.o \ lapacke_dgesv_work.o \ lapacke_dgesvd.o \ lapacke_dgesvd_work.o \ +lapacke_dgesvdq.o \ +lapacke_dgesvdq_work.o \ lapacke_dgesvdx.o \ lapacke_dgesvdx_work.o \ 
lapacke_dgesvj.o \ @@ -776,10 +793,10 @@ lapacke_dggevx.o \ lapacke_dggevx_work.o \ lapacke_dggglm.o \ lapacke_dggglm_work.o \ -lapacke_dgghrd.o \ -lapacke_dgghrd_work.o \ lapacke_dgghd3.o \ lapacke_dgghd3_work.o \ +lapacke_dgghrd.o \ +lapacke_dgghrd_work.o \ lapacke_dgglse.o \ lapacke_dgglse_work.o \ lapacke_dggqrf.o \ @@ -972,14 +989,14 @@ lapacke_dpttrs.o \ lapacke_dpttrs_work.o \ lapacke_dsbev.o \ lapacke_dsbev_work.o \ -lapacke_dsbevd.o \ -lapacke_dsbevd_work.o \ -lapacke_dsbevx.o \ -lapacke_dsbevx_work.o \ lapacke_dsbev_2stage.o \ lapacke_dsbev_2stage_work.o \ +lapacke_dsbevd.o \ +lapacke_dsbevd_work.o \ lapacke_dsbevd_2stage.o \ lapacke_dsbevd_2stage_work.o \ +lapacke_dsbevx.o \ +lapacke_dsbevx_work.o \ lapacke_dsbevx_2stage.o \ lapacke_dsbevx_2stage_work.o \ lapacke_dsbgst.o \ @@ -1060,18 +1077,18 @@ lapacke_dsyequb.o \ lapacke_dsyequb_work.o \ lapacke_dsyev.o \ lapacke_dsyev_work.o \ -lapacke_dsyevd.o \ -lapacke_dsyevd_work.o \ -lapacke_dsyevr.o \ -lapacke_dsyevr_work.o \ -lapacke_dsyevx.o \ -lapacke_dsyevx_work.o \ lapacke_dsyev_2stage.o \ lapacke_dsyev_2stage_work.o \ +lapacke_dsyevd.o \ +lapacke_dsyevd_work.o \ lapacke_dsyevd_2stage.o \ lapacke_dsyevd_2stage_work.o \ +lapacke_dsyevr.o \ +lapacke_dsyevr_work.o \ lapacke_dsyevr_2stage.o \ lapacke_dsyevr_2stage_work.o \ +lapacke_dsyevx.o \ +lapacke_dsyevx_work.o \ lapacke_dsyevx_2stage.o \ lapacke_dsyevx_2stage_work.o \ lapacke_dsygst.o \ @@ -1087,8 +1104,6 @@ lapacke_dsygvx_work.o \ lapacke_dsyrfs.o \ lapacke_dsyrfs_work.o \ lapacke_dsysv.o \ -lapacke_dsysv_rook.o \ -lapacke_dsysv_rook_work.o \ lapacke_dsysv_work.o \ lapacke_dsysv_aa.o \ lapacke_dsysv_aa_work.o \ @@ -1096,6 +1111,8 @@ lapacke_dsysv_aa_2stage.o \ lapacke_dsysv_aa_2stage_work.o \ lapacke_dsysv_rk.o \ lapacke_dsysv_rk_work.o \ +lapacke_dsysv_rook.o \ +lapacke_dsysv_rook_work.o \ lapacke_dsysvx.o \ lapacke_dsysvx_work.o \ lapacke_dsyswapr.o \ @@ -1104,36 +1121,34 @@ lapacke_dsytrd.o \ lapacke_dsytrd_work.o \ lapacke_dsytrf.o \ lapacke_dsytrf_work.o \ -lapacke_dsytrf_rook.o \ -lapacke_dsytrf_rook_work.o \ lapacke_dsytrf_aa.o \ lapacke_dsytrf_aa_work.o \ lapacke_dsytrf_aa_2stage.o \ lapacke_dsytrf_aa_2stage_work.o \ lapacke_dsytrf_rk.o \ lapacke_dsytrf_rk_work.o \ +lapacke_dsytrf_rook.o \ +lapacke_dsytrf_rook_work.o \ lapacke_dsytri.o \ +lapacke_dsytri_work.o \ lapacke_dsytri2.o \ lapacke_dsytri2_work.o \ -lapacke_dsytri_3.o \ -lapacke_dsytri_3_work.o \ lapacke_dsytri2x.o \ lapacke_dsytri2x_work.o \ -lapacke_dsytri_work.o - -OBJ_B = \ +lapacke_dsytri_3.o \ +lapacke_dsytri_3_work.o \ lapacke_dsytrs.o \ -lapacke_dsytrs_rook.o \ +lapacke_dsytrs_work.o \ lapacke_dsytrs2.o \ lapacke_dsytrs2_work.o \ -lapacke_dsytrs_work.o \ -lapacke_dsytrs_rook_work.o \ +lapacke_dsytrs_3.o \ +lapacke_dsytrs_3_work.o \ lapacke_dsytrs_aa.o \ -lapacke_dsytrs_aa_2stage.o \ lapacke_dsytrs_aa_work.o \ +lapacke_dsytrs_aa_2stage.o \ lapacke_dsytrs_aa_2stage_work.o \ -lapacke_dsytrs_3.o \ -lapacke_dsytrs_3_work.o \ +lapacke_dsytrs_rook.o \ +lapacke_dsytrs_rook_work.o \ lapacke_dtbcon.o \ lapacke_dtbcon_work.o \ lapacke_dtbrfs.o \ @@ -1165,9 +1180,9 @@ lapacke_dtpcon_work.o \ lapacke_dtpmqrt.o \ lapacke_dtpmqrt_work.o \ lapacke_dtpqrt.o \ +lapacke_dtpqrt_work.o \ lapacke_dtpqrt2.o \ lapacke_dtpqrt2_work.o \ -lapacke_dtpqrt_work.o \ lapacke_dtprfb.o \ lapacke_dtprfb_work.o \ lapacke_dtprfs.o \ @@ -1203,16 +1218,17 @@ lapacke_dtrttf_work.o \ lapacke_dtrttp.o \ lapacke_dtrttp_work.o \ lapacke_dtzrzf.o \ -lapacke_dtzrzf_work.o \ -lapacke_nancheck.o \ +lapacke_dtzrzf_work.o + +OBJ_S = \ lapacke_sbbcsd.o \ 
lapacke_sbbcsd_work.o \ lapacke_sbdsdc.o \ lapacke_sbdsdc_work.o \ -lapacke_sbdsvdx.o \ -lapacke_sbdsvdx_work.o \ lapacke_sbdsqr.o \ lapacke_sbdsqr_work.o \ +lapacke_sbdsvdx.o \ +lapacke_sbdsvdx_work.o \ lapacke_sdisna.o \ lapacke_sdisna_work.o \ lapacke_sgbbrd.o \ @@ -1290,11 +1306,11 @@ lapacke_sgeqrf_work.o \ lapacke_sgeqrfp.o \ lapacke_sgeqrfp_work.o \ lapacke_sgeqrt.o \ +lapacke_sgeqrt_work.o \ lapacke_sgeqrt2.o \ lapacke_sgeqrt2_work.o \ lapacke_sgeqrt3.o \ lapacke_sgeqrt3_work.o \ -lapacke_sgeqrt_work.o \ lapacke_sgerfs.o \ lapacke_sgerfs_work.o \ lapacke_sgerqf.o \ @@ -1305,6 +1321,8 @@ lapacke_sgesv.o \ lapacke_sgesv_work.o \ lapacke_sgesvd.o \ lapacke_sgesvd_work.o \ +lapacke_sgesvdq.o \ +lapacke_sgesvdq_work.o \ lapacke_sgesvdx.o \ lapacke_sgesvdx_work.o \ lapacke_sgesvj.o \ @@ -1341,10 +1359,10 @@ lapacke_sggevx.o \ lapacke_sggevx_work.o \ lapacke_sggglm.o \ lapacke_sggglm_work.o \ -lapacke_sgghrd.o \ -lapacke_sgghrd_work.o \ lapacke_sgghd3.o \ lapacke_sgghd3_work.o \ +lapacke_sgghrd.o \ +lapacke_sgghrd_work.o \ lapacke_sgglse.o \ lapacke_sgglse_work.o \ lapacke_sggqrf.o \ @@ -1537,14 +1555,14 @@ lapacke_spttrs.o \ lapacke_spttrs_work.o \ lapacke_ssbev.o \ lapacke_ssbev_work.o \ -lapacke_ssbevd.o \ -lapacke_ssbevd_work.o \ -lapacke_ssbevx.o \ -lapacke_ssbevx_work.o \ lapacke_ssbev_2stage.o \ lapacke_ssbev_2stage_work.o \ +lapacke_ssbevd.o \ +lapacke_ssbevd_work.o \ lapacke_ssbevd_2stage.o \ lapacke_ssbevd_2stage_work.o \ +lapacke_ssbevx.o \ +lapacke_ssbevx_work.o \ lapacke_ssbevx_2stage.o \ lapacke_ssbevx_2stage_work.o \ lapacke_ssbgst.o \ @@ -1621,18 +1639,18 @@ lapacke_ssyequb.o \ lapacke_ssyequb_work.o \ lapacke_ssyev.o \ lapacke_ssyev_work.o \ -lapacke_ssyevd.o \ -lapacke_ssyevd_work.o \ -lapacke_ssyevr.o \ -lapacke_ssyevr_work.o \ -lapacke_ssyevx.o \ -lapacke_ssyevx_work.o \ lapacke_ssyev_2stage.o \ lapacke_ssyev_2stage_work.o \ +lapacke_ssyevd.o \ +lapacke_ssyevd_work.o \ lapacke_ssyevd_2stage.o \ lapacke_ssyevd_2stage_work.o \ +lapacke_ssyevr.o \ +lapacke_ssyevr_work.o \ lapacke_ssyevr_2stage.o \ lapacke_ssyevr_2stage_work.o \ +lapacke_ssyevx.o \ +lapacke_ssyevx_work.o \ lapacke_ssyevx_2stage.o \ lapacke_ssyevx_2stage_work.o \ lapacke_ssygst.o \ @@ -1648,8 +1666,6 @@ lapacke_ssygvx_work.o \ lapacke_ssyrfs.o \ lapacke_ssyrfs_work.o \ lapacke_ssysv.o \ -lapacke_ssysv_rook.o \ -lapacke_ssysv_rook_work.o \ lapacke_ssysv_work.o \ lapacke_ssysv_aa.o \ lapacke_ssysv_aa_work.o \ @@ -1657,6 +1673,8 @@ lapacke_ssysv_aa_2stage.o \ lapacke_ssysv_aa_2stage_work.o \ lapacke_ssysv_rk.o \ lapacke_ssysv_rk_work.o \ +lapacke_ssysv_rook.o \ +lapacke_ssysv_rook_work.o \ lapacke_ssysvx.o \ lapacke_ssysvx_work.o \ lapacke_ssyswapr.o \ @@ -1665,34 +1683,34 @@ lapacke_ssytrd.o \ lapacke_ssytrd_work.o \ lapacke_ssytrf.o \ lapacke_ssytrf_work.o \ -lapacke_ssytrf_rook.o \ -lapacke_ssytrf_rook_work.o \ lapacke_ssytrf_aa.o \ lapacke_ssytrf_aa_work.o \ lapacke_ssytrf_aa_2stage.o \ lapacke_ssytrf_aa_2stage_work.o \ lapacke_ssytrf_rk.o \ lapacke_ssytrf_rk_work.o \ +lapacke_ssytrf_rook.o \ +lapacke_ssytrf_rook_work.o \ lapacke_ssytri.o \ +lapacke_ssytri_work.o \ lapacke_ssytri2.o \ lapacke_ssytri2_work.o \ -lapacke_ssytri_3.o \ -lapacke_ssytri_3_work.o \ lapacke_ssytri2x.o \ lapacke_ssytri2x_work.o \ -lapacke_ssytri_work.o \ +lapacke_ssytri_3.o \ +lapacke_ssytri_3_work.o \ lapacke_ssytrs.o \ -lapacke_ssytrs_rook.o \ +lapacke_ssytrs_work.o \ lapacke_ssytrs2.o \ lapacke_ssytrs2_work.o \ -lapacke_ssytrs_work.o \ -lapacke_ssytrs_rook_work.o \ +lapacke_ssytrs_3.o \ +lapacke_ssytrs_3_work.o \ 
lapacke_ssytrs_aa.o \ -lapacke_ssytrs_aa_2stage.o \ lapacke_ssytrs_aa_work.o \ +lapacke_ssytrs_aa_2stage.o \ lapacke_ssytrs_aa_2stage_work.o \ -lapacke_ssytrs_3.o \ -lapacke_ssytrs_3_work.o \ +lapacke_ssytrs_rook.o \ +lapacke_ssytrs_rook_work.o \ lapacke_stbcon.o \ lapacke_stbcon_work.o \ lapacke_stbrfs.o \ @@ -1762,7 +1780,9 @@ lapacke_strttf_work.o \ lapacke_strttp.o \ lapacke_strttp_work.o \ lapacke_stzrzf.o \ -lapacke_stzrzf_work.o \ +lapacke_stzrzf_work.o + +OBJ_Z = \ lapacke_zbbcsd.o \ lapacke_zbbcsd_work.o \ lapacke_zbdsqr.o \ @@ -1846,11 +1866,11 @@ lapacke_zgeqrf_work.o \ lapacke_zgeqrfp.o \ lapacke_zgeqrfp_work.o \ lapacke_zgeqrt.o \ +lapacke_zgeqrt_work.o \ lapacke_zgeqrt2.o \ lapacke_zgeqrt2_work.o \ lapacke_zgeqrt3.o \ lapacke_zgeqrt3_work.o \ -lapacke_zgeqrt_work.o \ lapacke_zgerfs.o \ lapacke_zgerfs_work.o \ lapacke_zgerqf.o \ @@ -1861,6 +1881,8 @@ lapacke_zgesv.o \ lapacke_zgesv_work.o \ lapacke_zgesvd.o \ lapacke_zgesvd_work.o \ +lapacke_zgesvdq.o \ +lapacke_zgesvdq_work.o \ lapacke_zgesvdx.o \ lapacke_zgesvdx_work.o \ lapacke_zgesvj.o \ @@ -1897,10 +1919,10 @@ lapacke_zggevx.o \ lapacke_zggevx_work.o \ lapacke_zggglm.o \ lapacke_zggglm_work.o \ -lapacke_zgghrd.o \ -lapacke_zgghrd_work.o \ lapacke_zgghd3.o \ lapacke_zgghd3_work.o \ +lapacke_zgghrd.o \ +lapacke_zgghrd_work.o \ lapacke_zgglse.o \ lapacke_zgglse_work.o \ lapacke_zggqrf.o \ @@ -1925,14 +1947,14 @@ lapacke_zgttrs.o \ lapacke_zgttrs_work.o \ lapacke_zhbev.o \ lapacke_zhbev_work.o \ -lapacke_zhbevd.o \ -lapacke_zhbevd_work.o \ -lapacke_zhbevx.o \ -lapacke_zhbevx_work.o \ lapacke_zhbev_2stage.o \ lapacke_zhbev_2stage_work.o \ +lapacke_zhbevd.o \ +lapacke_zhbevd_work.o \ lapacke_zhbevd_2stage.o \ lapacke_zhbevd_2stage_work.o \ +lapacke_zhbevx.o \ +lapacke_zhbevx_work.o \ lapacke_zhbevx_2stage.o \ lapacke_zhbevx_2stage_work.o \ lapacke_zhbgst.o \ @@ -1953,18 +1975,18 @@ lapacke_zheequb.o \ lapacke_zheequb_work.o \ lapacke_zheev.o \ lapacke_zheev_work.o \ -lapacke_zheevd.o \ -lapacke_zheevd_work.o \ -lapacke_zheevr.o \ -lapacke_zheevr_work.o \ -lapacke_zheevx.o \ -lapacke_zheevx_work.o \ lapacke_zheev_2stage.o \ lapacke_zheev_2stage_work.o \ +lapacke_zheevd.o \ +lapacke_zheevd_work.o \ lapacke_zheevd_2stage.o \ lapacke_zheevd_2stage_work.o \ +lapacke_zheevr.o \ +lapacke_zheevr_work.o \ lapacke_zheevr_2stage.o \ lapacke_zheevr_2stage_work.o \ +lapacke_zheevx.o \ +lapacke_zheevx_work.o \ lapacke_zheevx_2stage.o \ lapacke_zheevx_2stage_work.o \ lapacke_zhegst.o \ @@ -1994,35 +2016,35 @@ lapacke_zheswapr_work.o \ lapacke_zhetrd.o \ lapacke_zhetrd_work.o \ lapacke_zhetrf.o \ -lapacke_zhetrf_rook.o \ lapacke_zhetrf_work.o \ -lapacke_zhetrf_rook_work.o \ lapacke_zhetrf_aa.o \ -lapacke_zhetrf_aa_2stage.o \ lapacke_zhetrf_aa_work.o \ +lapacke_zhetrf_aa_2stage.o \ lapacke_zhetrf_aa_2stage_work.o \ lapacke_zhetrf_rk.o \ lapacke_zhetrf_rk_work.o \ +lapacke_zhetrf_rook.o \ +lapacke_zhetrf_rook_work.o \ lapacke_zhetri.o \ +lapacke_zhetri_work.o \ lapacke_zhetri2.o \ lapacke_zhetri2_work.o \ -lapacke_zhetri_3.o \ -lapacke_zhetri_3_work.o \ lapacke_zhetri2x.o \ lapacke_zhetri2x_work.o \ -lapacke_zhetri_work.o \ +lapacke_zhetri_3.o \ +lapacke_zhetri_3_work.o \ lapacke_zhetrs.o \ -lapacke_zhetrs_rook.o \ +lapacke_zhetrs_work.o \ lapacke_zhetrs2.o \ lapacke_zhetrs2_work.o \ -lapacke_zhetrs_work.o \ -lapacke_zhetrs_rook_work.o \ +lapacke_zhetrs_3.o \ +lapacke_zhetrs_3_work.o \ lapacke_zhetrs_aa.o \ -lapacke_zhetrs_aa_2stage.o \ lapacke_zhetrs_aa_work.o \ +lapacke_zhetrs_aa_2stage.o \ lapacke_zhetrs_aa_2stage_work.o \ -lapacke_zhetrs_3.o 
\ -lapacke_zhetrs_3_work.o \ +lapacke_zhetrs_rook.o \ +lapacke_zhetrs_rook_work.o \ lapacke_zhfrk.o \ lapacke_zhfrk_work.o \ lapacke_zhgeqz.o \ @@ -2213,11 +2235,11 @@ lapacke_zsyconv.o \ lapacke_zsyconv_work.o \ lapacke_zsyequb.o \ lapacke_zsyequb_work.o \ +lapacke_zsyr.o \ +lapacke_zsyr_work.o \ lapacke_zsyrfs.o \ lapacke_zsyrfs_work.o \ lapacke_zsysv.o \ -lapacke_zsysv_rook.o \ -lapacke_zsysv_rook_work.o \ lapacke_zsysv_work.o \ lapacke_zsysv_aa.o \ lapacke_zsysv_aa_work.o \ @@ -2225,40 +2247,42 @@ lapacke_zsysv_aa_2stage.o \ lapacke_zsysv_aa_2stage_work.o \ lapacke_zsysv_rk.o \ lapacke_zsysv_rk_work.o \ +lapacke_zsysv_rook.o \ +lapacke_zsysv_rook_work.o \ lapacke_zsysvx.o \ lapacke_zsysvx_work.o \ lapacke_zsyswapr.o \ lapacke_zsyswapr_work.o \ lapacke_zsytrf.o \ lapacke_zsytrf_work.o \ -lapacke_zsytrf_rook.o \ -lapacke_zsytrf_rook_work.o \ lapacke_zsytrf_aa.o \ -lapacke_zsytrf_aa_2stage.o \ lapacke_zsytrf_aa_work.o \ +lapacke_zsytrf_aa_2stage.o \ lapacke_zsytrf_aa_2stage_work.o \ lapacke_zsytrf_rk.o \ lapacke_zsytrf_rk_work.o \ +lapacke_zsytrf_rook.o \ +lapacke_zsytrf_rook_work.o \ lapacke_zsytri.o \ +lapacke_zsytri_work.o \ lapacke_zsytri2.o \ lapacke_zsytri2_work.o \ -lapacke_zsytri_3.o \ -lapacke_zsytri_3_work.o \ lapacke_zsytri2x.o \ lapacke_zsytri2x_work.o \ -lapacke_zsytri_work.o \ +lapacke_zsytri_3.o \ +lapacke_zsytri_3_work.o \ lapacke_zsytrs.o \ -lapacke_zsytrs_rook.o \ +lapacke_zsytrs_work.o \ lapacke_zsytrs2.o \ lapacke_zsytrs2_work.o \ -lapacke_zsytrs_work.o \ -lapacke_zsytrs_rook_work.o \ +lapacke_zsytrs_3.o \ +lapacke_zsytrs_3_work.o \ lapacke_zsytrs_aa.o \ -lapacke_zsytrs_aa_2stage.o \ lapacke_zsytrs_aa_work.o \ +lapacke_zsytrs_aa_2stage.o \ lapacke_zsytrs_aa_2stage_work.o \ -lapacke_zsytrs_3.o \ -lapacke_zsytrs_3_work.o \ +lapacke_zsytrs_rook.o \ +lapacke_zsytrs_rook_work.o \ lapacke_ztbcon.o \ lapacke_ztbcon_work.o \ lapacke_ztbrfs.o \ @@ -2290,9 +2314,9 @@ lapacke_ztpcon_work.o \ lapacke_ztpmqrt.o \ lapacke_ztpmqrt_work.o \ lapacke_ztpqrt.o \ +lapacke_ztpqrt_work.o \ lapacke_ztpqrt2.o \ lapacke_ztpqrt2_work.o \ -lapacke_ztpqrt_work.o \ lapacke_ztprfb.o \ lapacke_ztprfb_work.o \ lapacke_ztprfs.o \ @@ -2368,12 +2392,7 @@ lapacke_zunmtr_work.o \ lapacke_zupgtr.o \ lapacke_zupgtr_work.o \ lapacke_zupmtr.o \ -lapacke_zupmtr_work.o \ -lapacke_zsyr.o \ -lapacke_csyr.o \ -lapacke_zsyr_work.o \ -lapacke_csyr_work.o \ -lapacke_ilaver.o +lapacke_zupmtr_work.o ifdef BUILD_DEPRECATED DEPRECATED = \ @@ -2452,27 +2471,29 @@ lapacke_zlagsy.o \ lapacke_zlagsy_work.o endif -all: ../../$(LAPACKELIB) +.PHONY: all +all: $(LAPACKELIB) -.PHONY: ../../$(LAPACKELIB) - -../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN) - $(ARCH) $(ARCHFLAGS) $@ $(OBJ_A) - $(ARCH) $(ARCHFLAGS) $@ $(OBJ_B) +$(LAPACKELIB): $(OBJ) $(OBJ_S) $(OBJ_C) $(OBJ_D) $(OBJ_Z) $(DEPRECATED) $(EXTENDED) $(MATGEN) + $(AR) $(ARFLAGS) $@ $(OBJ) + $(AR) $(ARFLAGS) $@ $(OBJ_S) + $(AR) $(ARFLAGS) $@ $(OBJ_C) + $(AR) $(ARFLAGS) $@ $(OBJ_D) + $(AR) $(ARFLAGS) $@ $(OBJ_Z) ifdef BUILD_DEPRECATED - $(ARCH) $(ARCHFLAGS) $@ $(DEPRECATED) + $(AR) $(ARFLAGS) $@ $(DEPRECATED) endif ifdef (USEXBLAS) - $(ARCH) $(ARCHFLAGS) $@ $(EXTENDED) + $(AR) $(ARFLAGS) $@ $(EXTENDED) endif ifdef LAPACKE_WITH_TMG - $(ARCH) $(ARCHFLAGS) $@ $(MATGEN) + $(AR) $(ARFLAGS) $@ $(MATGEN) endif $(RANLIB) $@ -clean: cleanobj +.PHONY: clean cleanobj cleanlib +clean: cleanobj cleanlib cleanobj: rm -f *.o - -.c.o: - $(CC) $(CFLAGS) -I../include -c -o $@ $< +cleanlib: + rm -f $(LAPACKELIB) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c 
b/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c index 7d371f660..41278428b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c @@ -124,7 +124,6 @@ lapack_int LAPACKE_cgejsv( int matrix_layout, char joba, char jobu, char jobv, float* rwork = NULL; lapack_complex_float* cwork = NULL; lapack_int i; - lapack_int nu, nv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgejsv", -1 ); return -1; @@ -132,8 +131,6 @@ lapack_int LAPACKE_cgejsv( int matrix_layout, char joba, char jobu, char jobv, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m; - nv = LAPACKE_lsame( jobv, 'n' ) ? 1 : n; if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { return -10; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c b/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c index 2ee891977..9d022dae6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c @@ -75,7 +75,7 @@ lapack_int LAPACKE_cgelsd( int matrix_layout, lapack_int m, lapack_int n, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c new file mode 100644 index 000000000..91458136c --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgesvdq
+* Author: Intel Corporation
+* Generated November 2018
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp,
+                            char jobr, char jobu, char jobv,
+                            lapack_int m, lapack_int n, lapack_complex_float* a,
+                            lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu,
+                            lapack_complex_float* v, lapack_int ldv, lapack_int* numrank)
+{
+    lapack_int info = 0;
+    lapack_int liwork = -1;
+    lapack_int* iwork = NULL;
+    lapack_int iwork_query;
+    lapack_int lcwork = -1;
+    lapack_complex_float* cwork = NULL;
+    lapack_complex_float cwork_query;
+    lapack_int lrwork = -1;
+    float* rwork = NULL;
+    float rwork_query;
+    lapack_int i;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -9;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 &iwork_query, liwork, &cwork_query, lcwork,
+                                 &rwork_query, lrwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    liwork = iwork_query;
+    lcwork = LAPACK_C2INT(cwork_query);
+    lrwork = (lapack_int)rwork_query;
+    /* Allocate memory for work arrays */
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    cwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lcwork );
+    if( cwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    rwork = (float*)LAPACKE_malloc( sizeof(float) * lrwork );
+    if( rwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 iwork, liwork, cwork, lcwork, rwork, lrwork );
+
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+    LAPACKE_free( cwork );
+    LAPACKE_free( rwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgesvdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c
new file mode 100644
index 000000000..e86f76e4b
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c
@@ -0,0 +1,149 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgesvdq +* Author: Intel Corporation +* Generated November 2015 +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgesvdq_work( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, lapack_complex_float* a, + lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu, + lapack_complex_float* v, lapack_int ldv, lapack_int* numrank, + lapack_int* iwork, lapack_int liwork, + lapack_complex_float* cwork, lapack_int lcwork, + float* rwork, lapack_int lrwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv, + numrank, iwork, &liwork, cwork, &lcwork, rwork, &lrwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) || + LAPACKE_lsame( jobu, 's' ) ) ? m : 1; + lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m : + (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); + lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n : + ( LAPACKE_lsame( jobv, 's' ) ? MIN(m,n) : 1); + lapack_int lda_t = MAX(1,m); + lapack_int ldu_t = MAX(1,nrows_u); + lapack_int ldv_t = MAX(1,nrows_v); + lapack_complex_float* a_t = NULL; + lapack_complex_float* u_t = NULL; + lapack_complex_float* v_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info ); + return info; + } + if( ldu < ncols_u ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info ); + return info; + } + if( ldv < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lcwork == -1 ) { + LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t, + s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork, + cwork, &lcwork, rwork, &lrwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            u_t = (lapack_complex_float*)
+                  LAPACKE_malloc( sizeof(lapack_complex_float) * ldu_t * MAX(1,ncols_u) );
+            if( u_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            v_t = (lapack_complex_float*)
+                  LAPACKE_malloc( sizeof(lapack_complex_float) * ldv_t * MAX(1,n) );
+            if( v_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_2;
+            }
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a_t, &lda_t,
+                        s, u_t, &ldu_t, v_t, &ldv_t, numrank, iwork, &liwork,
+                        cwork, &lcwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t,
+                               u, ldu );
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v,
+                               ldv );
+        }
+        /* Release memory and exit */
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_free( v_t );
+        }
+exit_level_2:
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_free( u_t );
+        }
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cggesx.c b/lapack-netlib/LAPACKE/src/lapacke_cggesx.c
index fc939a314..9581691c6 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cggesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cggesx.c
@@ -91,7 +91,7 @@ lapack_int LAPACKE_cggesx( int matrix_layout, char jobvsl, char jobvsr,
     if( info != 0 ) {
         goto exit_level_2;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chbevd.c b/lapack-netlib/LAPACKE/src/lapacke_chbevd.c
index 024cf2585..b4af255a9 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chbevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chbevd.c
@@ -67,7 +67,7 @@ lapack_int LAPACKE_chbevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c
index 63f7d8ccb..e8e9a6830 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c
@@ -67,7 +67,7 @@ lapack_int LAPACKE_chbevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+
liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c b/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c index d44f6c622..5a7331d87 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c @@ -71,7 +71,7 @@ lapack_int LAPACKE_chbgvd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c index 40224607c..f505dfab0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_cheev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork, &info ); @@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd.c index d0dea375b..75fa47915 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd.c @@ -53,7 +53,7 @@ lapack_int LAPACKE_cheevd( int matrix_layout, char jobz, char uplo, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } @@ -65,7 +65,7 @@ lapack_int LAPACKE_cheevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c index d87481abf..cb4d34a09 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c @@ -53,7 +53,7 @@ lapack_int LAPACKE_cheevd_2stage( int matrix_layout, char jobz, char uplo, lapac #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } @@ -65,7 +65,7 @@ lapack_int LAPACKE_cheevd_2stage( int matrix_layout, char jobz, char uplo, lapac if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c 
b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c index cb51f9ee4..e9e6a5d1d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c @@ -71,7 +71,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_cheevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork, &lrwork, iwork, &liwork, &info ); @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c index 81869c564..4c5f352a8 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c @@ -71,7 +71,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_cheevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork, &lrwork, iwork, &liwork, &info ); @@ -79,7 +79,8 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevr.c b/lapack-netlib/LAPACKE/src/lapacke_cheevr.c index 6fe261624..f277e7f70 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevr.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_cheevr( int matrix_layout, char jobz, char range, char uplo, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevr_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_cheevr_2stage.c index 5b3f5c77a..a09eac1bd 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevr_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevr_2stage.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_cheevr_2stage( int matrix_layout, char jobz, char range, char if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst.c b/lapack-netlib/LAPACKE/src/lapacke_chegst.c index c628017c2..ff7dd3532 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, const lapack_complex_float* b, + lapack_int 
lda, lapack_complex_float* b, lapack_int ldb ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c index 001863819..a29e01961 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, const lapack_complex_float* b, + lapack_int lda, lapack_complex_float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c index 2959cb0dc..98c901982 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_chpevd.c b/lapack-netlib/LAPACKE/src/lapacke_chpevd.c index 47c7bbe23..fbdb73802 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chpevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chpevd.c @@ -66,7 +66,7 @@ lapack_int LAPACKE_chpevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_chpgvd.c b/lapack-netlib/LAPACKE/src/lapacke_chpgvd.c index 568882ec9..587d1509a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chpgvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chpgvd.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_chpgvd( int matrix_layout, lapack_int itype, char jobz, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c index 7f74a9789..8c4c21935 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c @@ -43,12 +43,10 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; + float* work_lapack = NULL; /* Check leading dimension(s) */ if( lda < n ) { info = -8; @@ -62,12 +60,23 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + } /* Transpose input matrices */ LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - res = LAPACK_clantr( 
&norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); - info = 0; /* LAPACK call is ok! */ + res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_1: LAPACKE_free( a_t ); exit_level_0: if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cstedc.c b/lapack-netlib/LAPACKE/src/lapacke_cstedc.c index 5be3cec70..3c0be27d5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cstedc.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cstedc.c @@ -73,7 +73,7 @@ lapack_int LAPACKE_cstedc( int matrix_layout, char compz, lapack_int n, float* d if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cstegr.c b/lapack-netlib/LAPACKE/src/lapacke_cstegr.c index 986702e62..86a0cd72d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cstegr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cstegr.c @@ -81,7 +81,7 @@ lapack_int LAPACKE_cstegr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cstemr.c b/lapack-netlib/LAPACKE/src/lapacke_cstemr.c index 9b9b84e49..51e63c675 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cstemr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cstemr.c @@ -75,7 +75,7 @@ lapack_int LAPACKE_cstemr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c index f4a0a4334..44405c993 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const lapack_complex_float* a, + lapack_int nrhs, lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c index d914c1d69..8567a07d5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const lapack_complex_float* a, + lapack_int nrhs, lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c b/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c index e2f38c87b..6bfcdc996 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c @@ -84,7 +84,7 @@ lapack_int LAPACKE_ctgsen( int matrix_layout, lapack_int ijob, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = LAPACK_C2INT( 
work_query ); /* Allocate memory for work arrays */ if( ijob != 0 ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctprfb.c b/lapack-netlib/LAPACKE/src/lapacke_ctprfb.c index 9d2684e4c..fd49d6a7f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctprfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctprfb.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_ctprfb( int matrix_layout, char side, char trans, char direct lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb ) { - lapack_int ncols_v, nrows_v; + lapack_int ncols_v, nrows_v, ncols_a, nrows_a; lapack_int info = 0; lapack_int ldwork; lapack_int work_size; @@ -52,20 +52,33 @@ lapack_int LAPACKE_ctprfb( int matrix_layout, char side, char trans, char direct } #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { - /* Optionally check input matrices for NaNs */ + /* Optionally check input matrices for NaNs + * V is m-by-k (left, columnwise) + * or n-by-k (right, columnwise) + * or k-by-m (left, rowwise) + * or k-by-n (right, rowwise) + * T is k-by-k + * A is k-by-n (left) + * or m-by-k (right) + * B is m-by-n + */ if( LAPACKE_lsame( storev, 'C' ) ) { ncols_v = k; nrows_v = LAPACKE_lsame( side, 'L' ) ? m : - ( LAPACKE_lsame( side, 'R' ) ? n : 0 ); + LAPACKE_lsame( side, 'R' ) ? n : 0; } else if( LAPACKE_lsame( storev, 'R' ) ) { ncols_v = LAPACKE_lsame( side, 'L' ) ? m : - ( LAPACKE_lsame( side, 'R' ) ? n : 0 ); + LAPACKE_lsame( side, 'R' ) ? n : 0; nrows_v = k; } else { ncols_v = 0; nrows_v = 0; } - if( LAPACKE_cge_nancheck( matrix_layout, k, m, a, lda ) ) { + nrows_a = LAPACKE_lsame( side, 'L' ) ? k : + LAPACKE_lsame( side, 'R' ) ? m : 0; + ncols_a = LAPACKE_lsame( side, 'L' ) ? n : + LAPACKE_lsame( side, 'R' ) ? k : 0; + if( LAPACKE_cge_nancheck( matrix_layout, ncols_a, nrows_a, a, lda ) ) { return -14; } if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) { @@ -80,13 +93,13 @@ lapack_int LAPACKE_ctprfb( int matrix_layout, char side, char trans, char direct } #endif if (side=='l' || side=='L') { - ldwork = k; - work_size = MAX(1,ldwork) * MAX(1,n); - } + ldwork = k; + work_size = MAX(1,ldwork) * MAX(1,n); + } else { - ldwork = m; - work_size = MAX(1,ldwork) * MAX(1,k); - } + ldwork = m; + work_size = MAX(1,ldwork) * MAX(1,k); + } /* Allocate memory for working array(s) */ work = (lapack_complex_float*) LAPACKE_malloc( sizeof(lapack_complex_float) * work_size ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmhr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmhr.c index 592c6de45..127dd8c57 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmhr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmhr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmhr( int matrix_layout, char side, char trans, if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -11; } - if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { return -10; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c index 27647954b..193d65737 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort, if( info != 0 ) { goto exit_level_1; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgejsv.c b/lapack-netlib/LAPACKE/src/lapacke_dgejsv.c index 
444a07b35..d9709bf89 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgejsv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgejsv.c @@ -74,7 +74,6 @@ lapack_int LAPACKE_dgejsv( int matrix_layout, char joba, char jobu, char jobv, lapack_int* iwork = NULL; double* work = NULL; lapack_int i; - lapack_int nu, nv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgejsv", -1 ); return -1; @@ -82,8 +81,6 @@ lapack_int LAPACKE_dgejsv( int matrix_layout, char joba, char jobu, char jobv, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m; - nv = LAPACKE_lsame( jobv, 'n' ) ? 1 : n; if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { return -10; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgelsd.c b/lapack-netlib/LAPACKE/src/lapacke_dgelsd.c index 6750597bb..790119596 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgelsd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgelsd.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_dgelsd( int matrix_layout, lapack_int m, lapack_int n, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c new file mode 100644 index 000000000..7bf831f8b --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
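The cunmhr hunk above and the matching dormhr hunk below correct the tau length used by the NaN check: in xUNMHR/xORMHR the orthogonal factor Q has order r, with r = m when side is 'L' and r = n when side is 'R', and tau carries r-1 reflector scalars. Checking m-1 entries unconditionally therefore over-read tau whenever side was 'R' with n < m. The rule, as a small hedged helper (unmhr_tau_len is hypothetical, for illustration only):

    #include "lapacke_utils.h"

    /* Hypothetical helper: how many tau entries xUNMHR/xORMHR define. */
    static lapack_int unmhr_tau_len( char side, lapack_int m, lapack_int n )
    {
        lapack_int r = LAPACKE_lsame( side, 'l' ) ? m : n; /* order of Q */
        return r - 1;                /* tau holds r-1 reflector scalars */
    }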
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgesvdq +* Author: Intel Corporation +* Generated November 2018 +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, double* a, + lapack_int lda, double* s, double* u, lapack_int ldu, + double* v, lapack_int ldv, lapack_int* numrank) +{ + lapack_int info = 0; + lapack_int liwork = -1; + lapack_int* iwork = NULL; + lapack_int iwork_query; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + lapack_int lrwork = -1; + double* rwork = NULL; + double rwork_query; + lapack_int i; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv, + m, n, a, lda, s, u, ldu, v, ldv, numrank, + &iwork_query, liwork, &work_query, lwork, + &rwork_query, lrwork ); + if( info != 0 ) { + goto exit_level_0; + } + liwork = iwork_query; + lwork = (lapack_int)work_query; + lrwork = (lapack_int)rwork_query; + /* Allocate memory for work arrays */ + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork ); + if( rwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv, + m, n, a, lda, s, u, ldu, v, ldv, numrank, + iwork, liwork, work, lwork, rwork, lrwork ); + + /* Release memory and exit */ + LAPACKE_free( iwork ); + LAPACKE_free( work ); + LAPACKE_free( rwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgesvdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq_work.c new file mode 100644 index 000000000..0de92a254 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq_work.c @@ -0,0 +1,149 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
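One practical use of the new numrank output is numerical rank detection. A short hedged sketch (illustrative values, not part of the patch); the 3-by-3 matrix is rank 2 by construction:

    #include <stdio.h>
    #include "lapacke.h"

    int main(void)
    {
        double a[9] = { 1.0, 0.0, 1.0,     /* column 1 */
                        0.0, 1.0, 1.0,     /* column 2 */
                        2.0, 3.0, 5.0 };   /* column 3 = 2*c1 + 3*c2 */
        double s[3], u[9], v[9];
        lapack_int numrank = 0;
        lapack_int info = LAPACKE_dgesvdq( LAPACK_COL_MAJOR, 'H', 'P', 'T',
                                           'A', 'A', 3, 3, a, 3, s, u, 3,
                                           v, 3, &numrank );
        if( info == 0 )
            printf( "numerical rank: %d\n", (int)numrank ); /* expect 2 */
        return (int)info;
    }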
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgesvdq +* Author: Intel Corporation +* Generated November 2015 +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgesvdq_work( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, double* a, + lapack_int lda, double* s, double* u, lapack_int ldu, + double* v, lapack_int ldv, lapack_int* numrank, + lapack_int* iwork, lapack_int liwork, + double* work, lapack_int lwork, + double* rwork, lapack_int lrwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv, + numrank, iwork, &liwork, work, &lwork, rwork, &lrwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) || + LAPACKE_lsame( jobu, 's' ) ) ? m : 1; + lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m : + (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); + lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n : + ( LAPACKE_lsame( jobv, 's' ) ? MIN(m,n) : 1); + lapack_int lda_t = MAX(1,m); + lapack_int ldu_t = MAX(1,nrows_u); + lapack_int ldv_t = MAX(1,nrows_v); + double* a_t = NULL; + double* u_t = NULL; + double* v_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info ); + return info; + } + if( ldu < ncols_u ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info ); + return info; + } + if( ldv < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t, + s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork, + work, &lwork, rwork, &lrwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            u_t = (double*)
+                  LAPACKE_malloc( sizeof(double) * ldu_t * MAX(1,ncols_u) );
+            if( u_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            v_t = (double*)
+                  LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,n) );
+            if( v_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_2;
+            }
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a_t, &lda_t,
+                        s, u_t, &ldu_t, v_t, &ldv_t, numrank, iwork, &liwork,
+                        work, &lwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t,
+                               u, ldu );
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v,
+                               ldv );
+        }
+        /* Release memory and exit */
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_free( v_t );
+        }
+exit_level_2:
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_free( u_t );
+        }
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dggesx.c b/lapack-netlib/LAPACKE/src/lapacke_dggesx.c
index 36addda74..91eb7bf8c 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dggesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dggesx.c
@@ -82,7 +82,7 @@ lapack_int LAPACKE_dggesx( int matrix_layout, char jobvsl, char jobvsr,
     if( info != 0 ) {
         goto exit_level_1;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c
index 2d570ce42..5b2a6c535 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c
@@ -42,12 +42,10 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
-        if( info < 0 ) {
-            info = info - 1;
-        }
     } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
         lapack_int lda_t = MAX(1,m);
         double* a_t = NULL;
+        double* work_lapack = NULL;
         /* Check leading dimension(s) */
         if( lda < n ) {
             info = -8;
@@ -60,12 +58,23 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo,
             info = LAPACK_TRANSPOSE_MEMORY_ERROR;
             goto exit_level_0;
         }
+        /* Allocate memory for work array(s) */
+        if( LAPACKE_lsame( norm, 'i' ) ) {
+            work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) );
+            if( work_lapack == NULL ) {
+                info =
LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + } /* Transpose input matrices */ LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); - info = 0; /* LAPACK call is ok! */ + res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_1: LAPACKE_free( a_t ); exit_level_0: if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormhr.c b/lapack-netlib/LAPACKE/src/lapacke_dormhr.c index de4355a74..4b9526f14 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormhr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormhr.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_dormhr( int matrix_layout, char side, char trans, if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -11; } - if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { return -10; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsbevd.c b/lapack-netlib/LAPACKE/src/lapacke_dsbevd.c index 4ecd1b522..3a9abbbe1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsbevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsbevd.c @@ -62,7 +62,7 @@ lapack_int LAPACKE_dsbevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsbevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsbevd_2stage.c index b0ccc0b1e..4d42b6208 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsbevd_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsbevd_2stage.c @@ -62,7 +62,7 @@ lapack_int LAPACKE_dsbevd_2stage( int matrix_layout, char jobz, char uplo, lapac if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c index 36f912ee5..cab2a64bb 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c @@ -67,7 +67,7 @@ lapack_int LAPACKE_dsbgvd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dspevd.c b/lapack-netlib/LAPACKE/src/lapacke_dspevd.c index 3b6b25d5e..c7d93b6b3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dspevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dspevd.c @@ -61,7 +61,7 @@ lapack_int LAPACKE_dspevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dspgvd.c b/lapack-netlib/LAPACKE/src/lapacke_dspgvd.c index 8ca478ed1..b49ce95ec 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_dspgvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dspgvd.c @@ -66,7 +66,7 @@ lapack_int LAPACKE_dspgvd( int matrix_layout, lapack_int itype, char jobz, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstedc.c b/lapack-netlib/LAPACKE/src/lapacke_dstedc.c index 4f88a04c4..16e308450 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dstedc.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dstedc.c @@ -69,7 +69,7 @@ lapack_int LAPACKE_dstedc( int matrix_layout, char compz, lapack_int n, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstegr.c b/lapack-netlib/LAPACKE/src/lapacke_dstegr.c index 9191f0a9f..7e4f9d694 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dstegr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dstegr.c @@ -81,7 +81,7 @@ lapack_int LAPACKE_dstegr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstemr.c b/lapack-netlib/LAPACKE/src/lapacke_dstemr.c index 8dc2bd237..1a3b0ac7b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dstemr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dstemr.c @@ -75,7 +75,7 @@ lapack_int LAPACKE_dstemr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstevd.c b/lapack-netlib/LAPACKE/src/lapacke_dstevd.c index e824a164b..251a2ae2e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dstevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dstevd.c @@ -64,7 +64,7 @@ lapack_int LAPACKE_dstevd( int matrix_layout, char jobz, lapack_int n, double* d if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstevr.c b/lapack-netlib/LAPACKE/src/lapacke_dstevr.c index fd53e0ac0..d49e0ff1c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dstevr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dstevr.c @@ -81,7 +81,7 @@ lapack_int LAPACKE_dstevr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c index 9dc67f022..5a416ff45 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c @@ -65,14 +65,14 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, 
char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_dsyev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } /* Transpose output matrices */ - LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd.c index 870148b31..d6772ea01 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd.c @@ -50,7 +50,7 @@ lapack_int LAPACKE_dsyevd( int matrix_layout, char jobz, char uplo, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } @@ -61,7 +61,7 @@ lapack_int LAPACKE_dsyevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage.c index a5507394c..e866451a5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage.c @@ -50,7 +50,7 @@ lapack_int LAPACKE_dsyevd_2stage( int matrix_layout, char jobz, char uplo, lapac #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } @@ -61,7 +61,7 @@ lapack_int LAPACKE_dsyevd_2stage( int matrix_layout, char jobz, char uplo, lapac if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c index 1d06250d1..90d8ce8dc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c @@ -68,7 +68,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_dsyevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, iwork, &liwork, &info ); @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git 
a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c index 925912619..fff476445 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c @@ -68,7 +68,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_dsyevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, iwork, &liwork, &info ); @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevr.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevr.c index bae72f6c3..290ae0bd4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevr.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_dsyevr( int matrix_layout, char jobz, char range, char uplo, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevr_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevr_2stage.c index dad20209e..7ee7dbc0b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevr_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevr_2stage.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_dsyevr_2stage( int matrix_layout, char jobz, char range, char if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c index 907ad50bd..51f333359 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c @@ -66,7 +66,7 @@ lapack_int LAPACKE_dsygvd( int matrix_layout, lapack_int itype, char jobz, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c index 46c90190f..4d73ef3c1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const double* a, lapack_int lda, + lapack_int nrhs, double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c index c937c39c5..caffa5b4b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" 
lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const double* a, + lapack_int nrhs, double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c b/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c index 2cb7fce4b..baa63abe7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c @@ -81,7 +81,7 @@ lapack_int LAPACKE_dtgsen( int matrix_layout, lapack_int ijob, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ if( ijob != 0 ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtprfb.c b/lapack-netlib/LAPACKE/src/lapacke_dtprfb.c index 5191f79bb..11031b9bb 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtprfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtprfb.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_dtprfb( int matrix_layout, char side, char trans, char direct lapack_int ldv, const double* t, lapack_int ldt, double* a, lapack_int lda, double* b, lapack_int ldb ) { - lapack_int ncols_v, nrows_v; + lapack_int ncols_v, nrows_v, ncols_a, nrows_a; lapack_int info = 0; lapack_int ldwork; lapack_int work_size; @@ -50,20 +50,33 @@ lapack_int LAPACKE_dtprfb( int matrix_layout, char side, char trans, char direct } #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { - /* Optionally check input matrices for NaNs */ + /* Optionally check input matrices for NaNs + * V is m-by-k (left, columnwise) + * or n-by-k (right, columnwise) + * or k-by-m (left, rowwise) + * or k-by-n (right, rowwise) + * T is k-by-k + * A is k-by-n (left) + * or m-by-k (right) + * B is m-by-n + */ if( LAPACKE_lsame( storev, 'C' ) ) { ncols_v = k; nrows_v = LAPACKE_lsame( side, 'L' ) ? m : - ( LAPACKE_lsame( side, 'R' ) ? n : 0 ); + LAPACKE_lsame( side, 'R' ) ? n : 0; } else if( LAPACKE_lsame( storev, 'R' ) ) { ncols_v = LAPACKE_lsame( side, 'L' ) ? m : - ( LAPACKE_lsame( side, 'R' ) ? n : 0 ); + LAPACKE_lsame( side, 'R' ) ? n : 0; nrows_v = k; } else { ncols_v = 0; nrows_v = 0; } - if( LAPACKE_dge_nancheck( matrix_layout, k, m, a, lda ) ) { + nrows_a = LAPACKE_lsame( side, 'L' ) ? k : + LAPACKE_lsame( side, 'R' ) ? m : 0; + ncols_a = LAPACKE_lsame( side, 'L' ) ? n : + LAPACKE_lsame( side, 'R' ) ? 
k : 0; + if( LAPACKE_dge_nancheck( matrix_layout, ncols_a, nrows_a, a, lda ) ) { return -14; } if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) { @@ -78,16 +91,16 @@ lapack_int LAPACKE_dtprfb( int matrix_layout, char side, char trans, char direct } #endif if (side=='l' || side=='L') { - ldwork = k; - work_size = MAX(1,ldwork) * MAX(1,n); - } + ldwork = k; + work_size = MAX(1,ldwork) * MAX(1,n); + } else { - ldwork = m; - work_size = MAX(1,ldwork) * MAX(1,k); - } + ldwork = m; + work_size = MAX(1,ldwork) * MAX(1,k); + } /* Allocate memory for working array(s) */ work = (double*) - LAPACKE_malloc( sizeof(double) * work_size ); + LAPACKE_malloc( sizeof(double) * work_size ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsen.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsen.c index 521bc2701..67932fd98 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsen.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_dtrsen( int matrix_layout, char job, char compq, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ if( LAPACKE_lsame( job, 'b' ) || LAPACKE_lsame( job, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c index 91cfc4fa5..0bc14b33e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort, if( info != 0 ) { goto exit_level_1; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgejsv.c b/lapack-netlib/LAPACKE/src/lapacke_sgejsv.c index aa0eeb746..0703e902f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgejsv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgejsv.c @@ -74,7 +74,6 @@ lapack_int LAPACKE_sgejsv( int matrix_layout, char joba, char jobu, char jobv, lapack_int* iwork = NULL; float* work = NULL; lapack_int i; - lapack_int nu, nv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgejsv", -1 ); return -1; @@ -82,8 +81,6 @@ lapack_int LAPACKE_sgejsv( int matrix_layout, char joba, char jobu, char jobv, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m; - nv = LAPACKE_lsame( jobv, 'n' ) ? 
1 : n; if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { return -10; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgelsd.c b/lapack-netlib/LAPACKE/src/lapacke_sgelsd.c index fc42b1eec..9d00ded10 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgelsd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgelsd.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_sgelsd( int matrix_layout, lapack_int m, lapack_int n, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c new file mode 100644 index 000000000..5ff543d10 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
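NOTE: the recurring hunks above that turn "liwork = (lapack_int)iwork_query;" into "liwork = iwork_query;" all drop a redundant cast: the integer workspace size is now returned through a variable that is already a lapack_int, while the optimal lwork still comes back encoded in a float/double and keeps its cast. A minimal sketch of the two-call query idiom these wrappers use (the dstedc example and helper name are illustrative, not taken from the patch):

    #include <lapacke.h>

    /* Illustrative only: the two-call workspace query, here for dstedc.
     * iwork_query is already a lapack_int, so no cast is needed; the
     * optimal lwork still comes back encoded in a double. */
    static lapack_int stedc_query_sizes( lapack_int n, double* d, double* e,
                                         double* z, lapack_int ldz,
                                         lapack_int* lwork, lapack_int* liwork )
    {
        double work_query;
        lapack_int iwork_query;
        /* lwork = liwork = -1 requests a size query instead of a solve */
        lapack_int info = LAPACKE_dstedc_work( LAPACK_COL_MAJOR, 'I', n, d, e,
                                               z, ldz, &work_query, -1,
                                               &iwork_query, -1 );
        if( info == 0 ) {
            *liwork = iwork_query;              /* no (lapack_int) cast needed */
            *lwork  = (lapack_int)work_query;   /* size is encoded in a double */
        }
        return info;
    }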
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgesvdq +* Author: Intel Corporation +* Generated November 2018 +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, float* a, + lapack_int lda, float* s, float* u, lapack_int ldu, + float* v, lapack_int ldv, lapack_int* numrank) +{ + lapack_int info = 0; + lapack_int liwork = -1; + lapack_int* iwork = NULL; + lapack_int iwork_query; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + lapack_int lrwork = -1; + float* rwork = NULL; + float rwork_query; + lapack_int i; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv, + m, n, a, lda, s, u, ldu, v, ldv, numrank, + &iwork_query, liwork, &work_query, lwork, + &rwork_query, lrwork ); + if( info != 0 ) { + goto exit_level_0; + } + liwork = iwork_query; + lwork = (lapack_int)work_query; + lrwork = (lapack_int)rwork_query; + /* Allocate memory for work arrays */ + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + rwork = (float*)LAPACKE_malloc( sizeof(float) * lrwork ); + if( rwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv, + m, n, a, lda, s, u, ldu, v, ldv, numrank, + iwork, liwork, work, lwork, rwork, lrwork ); + + /* Release memory and exit */ + LAPACKE_free( iwork ); + LAPACKE_free( work ); + LAPACKE_free( rwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgesvdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq_work.c new file mode 100644 index 000000000..9eab982c2 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq_work.c @@ -0,0 +1,148 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
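NOTE: a hedged usage sketch for the new LAPACKE_sgesvdq wrapper added above. The signature is taken from the patch itself; the job-flag values ('A'ccurate level, column 'P'ivoting, 'T'ransposing allowed, full U and V) follow the xGESVDQ documentation and should be read as illustrative assumptions, not part of this change:

    #include <stdio.h>
    #include <lapacke.h>

    /* Illustrative only: compute the SVD of an m-by-n matrix a with the
     * new high-level wrapper and report the numerical rank it returns. */
    static int sgesvdq_example( lapack_int m, lapack_int n,
                                float* a, lapack_int lda, float* s,
                                float* u, lapack_int ldu,
                                float* v, lapack_int ldv )
    {
        lapack_int numrank = 0;
        lapack_int info = LAPACKE_sgesvdq( LAPACK_COL_MAJOR, 'A', 'P', 'T',
                                           'A', 'A', m, n, a, lda, s,
                                           u, ldu, v, ldv, &numrank );
        if( info == 0 )
            printf( "numerical rank of A: %d\n", (int)numrank );
        return (int)info;
    }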
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgesvdq +* Author: Intel Corporation +* Generated November 2015 +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgesvdq_work( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, float* a, + lapack_int lda, float* s, float* u, lapack_int ldu, + float* v, lapack_int ldv, lapack_int* numrank, + lapack_int* iwork, lapack_int liwork, + float* work, lapack_int lwork, + float* rwork, lapack_int lrwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv, + numrank, iwork, &liwork, work, &lwork, rwork, &lrwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) || + LAPACKE_lsame( jobu, 's' ) ) ? m : 1; + lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m : + (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); + lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n : 1; + lapack_int lda_t = MAX(1,m); + lapack_int ldu_t = MAX(1,nrows_u); + lapack_int ldv_t = MAX(1,nrows_v); + float* a_t = NULL; + float* u_t = NULL; + float* v_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info ); + return info; + } + if( ldu < ncols_u ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info ); + return info; + } + if( ldv < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t, + s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork, + work, &lwork, rwork, &lrwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) { + u_t = (float*) + LAPACKE_malloc( sizeof(float) * ldu_t * MAX(1,ncols_u) ); + if( u_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + } + if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) { + v_t = (float*) + LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t, + s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork, + work, &lwork, rwork, &lrwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) { + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t, + u, ldu ); + } + if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) { + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v, + ldv ); + } + /* Release memory and exit */ + if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) { + LAPACKE_free( v_t ); + } +exit_level_2: + if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) { + LAPACKE_free( u_t ); + } +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sggesx.c b/lapack-netlib/LAPACKE/src/lapacke_sggesx.c index f0acb70a4..d552a2010 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sggesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sggesx.c @@ -82,7 +82,7 @@ lapack_int LAPACKE_sggesx( int matrix_layout, char jobvsl, char jobvsr, if( info != 0 ) { goto exit_level_1; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c index e9f84b55c..e1d4c270d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c @@ -42,12 +42,10 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); float* a_t = NULL; + float* work_lapack = NULL; /* Check leading dimension(s) */ if( lda < n ) { info = -8; @@ -60,12 +58,23 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( work_lapack == NULL ) { + info = 
LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + } /* Transpose input matrices */ LAPACKE_str_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); - info = 0; /* LAPACK call is ok! */ + res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_1: LAPACKE_free( a_t ); exit_level_0: if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormhr.c b/lapack-netlib/LAPACKE/src/lapacke_sormhr.c index a5cca2c45..fba215a19 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormhr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormhr.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_sormhr( int matrix_layout, char side, char trans, if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -11; } - if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { return -10; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssbevd.c b/lapack-netlib/LAPACKE/src/lapacke_ssbevd.c index 3acdeb95d..b41e5b156 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssbevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssbevd.c @@ -62,7 +62,7 @@ lapack_int LAPACKE_ssbevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssbevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssbevd_2stage.c index 2eda9cde9..a76d92c71 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssbevd_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssbevd_2stage.c @@ -62,7 +62,7 @@ lapack_int LAPACKE_ssbevd_2stage( int matrix_layout, char jobz, char uplo, lapac if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c index a6c036846..b40ccb9e5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c @@ -67,7 +67,7 @@ lapack_int LAPACKE_ssbgvd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sspevd.c b/lapack-netlib/LAPACKE/src/lapacke_sspevd.c index bd06a8ba6..9b518751b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sspevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sspevd.c @@ -61,7 +61,7 @@ lapack_int LAPACKE_sspevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sspgvd.c b/lapack-netlib/LAPACKE/src/lapacke_sspgvd.c index 749abb0b1..e80e24647 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_sspgvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sspgvd.c @@ -66,7 +66,7 @@ lapack_int LAPACKE_sspgvd( int matrix_layout, lapack_int itype, char jobz, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstedc.c b/lapack-netlib/LAPACKE/src/lapacke_sstedc.c index 157874668..f902e8c30 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sstedc.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sstedc.c @@ -69,7 +69,7 @@ lapack_int LAPACKE_sstedc( int matrix_layout, char compz, lapack_int n, float* d if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstegr.c b/lapack-netlib/LAPACKE/src/lapacke_sstegr.c index c6a73b2b4..c02372ba2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sstegr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sstegr.c @@ -81,7 +81,7 @@ lapack_int LAPACKE_sstegr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstemr.c b/lapack-netlib/LAPACKE/src/lapacke_sstemr.c index 4229819ab..65dcc9170 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sstemr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sstemr.c @@ -74,7 +74,7 @@ lapack_int LAPACKE_sstemr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstevd.c b/lapack-netlib/LAPACKE/src/lapacke_sstevd.c index 9f9e2e79e..c5db5d79d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sstevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sstevd.c @@ -64,7 +64,7 @@ lapack_int LAPACKE_sstevd( int matrix_layout, char jobz, lapack_int n, float* d, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstevr.c b/lapack-netlib/LAPACKE/src/lapacke_sstevr.c index f45c49087..4043e3090 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sstevr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sstevr.c @@ -81,7 +81,7 @@ lapack_int LAPACKE_sstevr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c index fb8c8971b..6a2f8fce3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c @@ -65,14 +65,14 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, 
char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_ssyev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } /* Transpose output matrices */ - LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd.c index 1995e7950..f5924bd94 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd.c @@ -50,7 +50,7 @@ lapack_int LAPACKE_ssyevd( int matrix_layout, char jobz, char uplo, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } @@ -61,7 +61,7 @@ lapack_int LAPACKE_ssyevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage.c index 6d6785acc..40ef1bcc2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage.c @@ -50,7 +50,7 @@ lapack_int LAPACKE_ssyevd_2stage( int matrix_layout, char jobz, char uplo, lapac #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } @@ -61,7 +61,7 @@ lapack_int LAPACKE_ssyevd_2stage( int matrix_layout, char jobz, char uplo, lapac if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c index 5942a9abb..9394f822f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c @@ -68,7 +68,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_ssyevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, iwork, &liwork, &info ); @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git 
a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c index 7b2e19adc..12d9e84e6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c @@ -68,7 +68,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_ssyevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, iwork, &liwork, &info ); @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevr.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevr.c index d7e050143..3274f6bab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevr.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_ssyevr( int matrix_layout, char jobz, char range, char uplo, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevr_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevr_2stage.c index cbc3014e9..8958be31d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevr_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevr_2stage.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_ssyevr_2stage( int matrix_layout, char jobz, char range, char if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c index 2a1c62aef..5afe8d2de 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c @@ -66,7 +66,7 @@ lapack_int LAPACKE_ssygvd( int matrix_layout, lapack_int itype, char jobz, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c index a95a71469..19f447cd8 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const float* a, lapack_int lda, + lapack_int nrhs, float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c index cf98f443d..7d348b382 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" 
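NOTE: the sge_trans -> ssy_trans substitutions in the ssyev/ssyevd wrappers above matter because only the triangle selected by uplo holds defined data in a symmetric driver; a full general-matrix copy would read the other, possibly uninitialized, triangle. A hand-rolled sketch of the idea (illustrative only, not the library routine):

    #include <lapacke.h>

    /* Illustrative only: copy just the uplo triangle of a row-major
     * symmetric matrix into column-major storage, never touching the
     * other (undefined) triangle. */
    static void sy_trans_sketch( char uplo, lapack_int n,
                                 const float* a, lapack_int lda,  /* row-major in  */
                                 float* a_t, lapack_int lda_t )   /* col-major out */
    {
        lapack_int i, j;
        int lower = ( uplo == 'L' || uplo == 'l' );
        for( j = 0; j < n; j++ ) {
            lapack_int first = lower ? j : 0;        /* 'L': rows i >= j */
            lapack_int last  = lower ? n : j + 1;    /* 'U': rows i <= j */
            for( i = first; i < last; i++ )
                a_t[i + j * lda_t] = a[i * lda + j]; /* same element, new layout */
        }
    }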
lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const float* a, + lapack_int nrhs, float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_stgsen.c b/lapack-netlib/LAPACKE/src/lapacke_stgsen.c index 5464fd22b..d0250eb63 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_stgsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_stgsen.c @@ -81,7 +81,7 @@ lapack_int LAPACKE_stgsen( int matrix_layout, lapack_int ijob, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ if( ijob != 0 ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_stprfb.c b/lapack-netlib/LAPACKE/src/lapacke_stprfb.c index 846d4ccb3..2ea20f08d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_stprfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_stprfb.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_stprfb( int matrix_layout, char side, char trans, char direct lapack_int ldv, const float* t, lapack_int ldt, float* a, lapack_int lda, float* b, lapack_int ldb) { - lapack_int ncols_v, nrows_v; + lapack_int ncols_v, nrows_v, ncols_a, nrows_a; lapack_int info = 0; lapack_int ldwork; lapack_int work_size; @@ -50,20 +50,33 @@ lapack_int LAPACKE_stprfb( int matrix_layout, char side, char trans, char direct } #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { - /* Optionally check input matrices for NaNs */ + /* Optionally check input matrices for NaNs + * V is m-by-k (left, columnwise) + * or n-by-k (right, columnwise) + * or k-by-m (left, rowwise) + * or k-by-n (right, rowwise) + * T is k-by-k + * A is k-by-n (left) + * or m-by-k (right) + * B is m-by-n + */ if( LAPACKE_lsame( storev, 'C' ) ) { ncols_v = k; nrows_v = LAPACKE_lsame( side, 'L' ) ? m : - ( LAPACKE_lsame( side, 'R' ) ? n : 0 ); + LAPACKE_lsame( side, 'R' ) ? n : 0; } else if( LAPACKE_lsame( storev, 'R' ) ) { ncols_v = LAPACKE_lsame( side, 'L' ) ? m : - ( LAPACKE_lsame( side, 'R' ) ? n : 0 ); + LAPACKE_lsame( side, 'R' ) ? n : 0; nrows_v = k; } else { ncols_v = 0; nrows_v = 0; } - if( LAPACKE_sge_nancheck( matrix_layout, k, m, a, lda ) ) { + nrows_a = LAPACKE_lsame( side, 'L' ) ? k : + LAPACKE_lsame( side, 'R' ) ? m : 0; + ncols_a = LAPACKE_lsame( side, 'L' ) ? n : + LAPACKE_lsame( side, 'R' ) ? 
k : 0; + if( LAPACKE_sge_nancheck( matrix_layout, ncols_a, nrows_a, a, lda ) ) { return -14; } if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) { @@ -78,14 +91,14 @@ lapack_int LAPACKE_stprfb( int matrix_layout, char side, char trans, char direct } #endif if (side=='l' || side=='L') { - ldwork = k; - work_size = MAX(1,ldwork) * MAX(1,n); - } + ldwork = k; + work_size = MAX(1,ldwork) * MAX(1,n); + } else { - ldwork = m; - work_size = MAX(1,ldwork) * MAX(1,k); - } - /* Allocate memory for working array(s) */ + ldwork = m; + work_size = MAX(1,ldwork) * MAX(1,k); + } + /* Allocate memory for working array(s) */ work = (float*) LAPACKE_malloc( sizeof(float) * work_size ); if( work == NULL ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsen.c b/lapack-netlib/LAPACKE/src/lapacke_strsen.c index efba91af8..0ec3ee907 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strsen.c @@ -69,7 +69,7 @@ lapack_int LAPACKE_strsen( int matrix_layout, char job, char compq, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ if( LAPACKE_lsame( job, 'b' ) || LAPACKE_lsame( job, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgejsv.c b/lapack-netlib/LAPACKE/src/lapacke_zgejsv.c index f3b5110a7..153efb371 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgejsv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgejsv.c @@ -124,7 +124,6 @@ lapack_int LAPACKE_zgejsv( int matrix_layout, char joba, char jobu, char jobv, double* rwork = NULL; lapack_complex_double* cwork = NULL; lapack_int i; - lapack_int nu, nv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgejsv", -1 ); return -1; @@ -132,8 +131,6 @@ lapack_int LAPACKE_zgejsv( int matrix_layout, char joba, char jobu, char jobv, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m; - nv = LAPACKE_lsame( jobv, 'n' ) ? 1 : n; if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { return -10; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgelsd.c b/lapack-netlib/LAPACKE/src/lapacke_zgelsd.c index 6d111c69f..eca145090 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgelsd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgelsd.c @@ -75,7 +75,7 @@ lapack_int LAPACKE_zgelsd( int matrix_layout, lapack_int m, lapack_int n, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c new file mode 100644 index 000000000..f58a5c4e9 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
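NOTE: the corrected NaN checks in the ?tprfb wrappers above follow the dimensions spelled out in the new comment block: A is k-by-n when side is 'L' and m-by-k when side is 'R'. An illustrative helper restating that logic (not part of the patch):

    #include <lapacke.h>

    /* Illustrative only: dimensions of A in ?tprfb, per the comment block
     * introduced by this patch. */
    static void tprfb_a_dims( char side, lapack_int m, lapack_int n,
                              lapack_int k,
                              lapack_int* nrows_a, lapack_int* ncols_a )
    {
        if( side == 'L' || side == 'l' ) {        /* A is k-by-n on the left  */
            *nrows_a = k;
            *ncols_a = n;
        } else if( side == 'R' || side == 'r' ) { /* A is m-by-k on the right */
            *nrows_a = m;
            *ncols_a = k;
        } else {                                  /* invalid side: check nothing */
            *nrows_a = 0;
            *ncols_a = 0;
        }
    }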
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgesvdq +* Author: Intel Corporation +* Generated November 2018 +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, lapack_complex_double* a, + lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu, + lapack_complex_double* v, lapack_int ldv, lapack_int* numrank) +{ + lapack_int info = 0; + lapack_int liwork = -1; + lapack_int* iwork = NULL; + lapack_int iwork_query; + lapack_int lcwork = -1; + lapack_complex_double* cwork = NULL; + lapack_complex_double cwork_query; + lapack_int lrwork = -1; + double* rwork = NULL; + double rwork_query; + lapack_int i; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv, + m, n, a, lda, s, u, ldu, v, ldv, numrank, + &iwork_query, liwork, &cwork_query, lcwork, + &rwork_query, lrwork ); + if( info != 0 ) { + goto exit_level_0; + } + liwork = iwork_query; + lcwork = LAPACK_C2INT(cwork_query); + lrwork = (lapack_int)rwork_query; + /* Allocate memory for work arrays */ + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + cwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lcwork ); + if( cwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork ); + if( rwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv, + m, n, a, lda, s, u, ldu, v, ldv, numrank, + iwork, liwork, cwork, lcwork, rwork, lrwork ); + + /* 
Release memory and exit */ + LAPACKE_free( iwork ); + LAPACKE_free( cwork ); + LAPACKE_free( rwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgesvdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq_work.c new file mode 100644 index 000000000..5824de4e0 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq_work.c @@ -0,0 +1,149 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgesvdq +* Author: Intel Corporation +* Generated November 2015 +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgesvdq_work( int matrix_layout, char joba, char jobp, + char jobr, char jobu, char jobv, + lapack_int m, lapack_int n, lapack_complex_double* a, + lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu, + lapack_complex_double* v, lapack_int ldv, lapack_int* numrank, + lapack_int* iwork, lapack_int liwork, + lapack_complex_double* cwork, lapack_int lcwork, + double* rwork, lapack_int lrwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv, + numrank, iwork, &liwork, cwork, &lcwork, rwork, &lrwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) || + LAPACKE_lsame( jobu, 's' ) ) ? m : 1; + lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m : + (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); + lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n : + ( LAPACKE_lsame( jobv, 's' ) ? 
MIN(m,n) : 1); + lapack_int lda_t = MAX(1,m); + lapack_int ldu_t = MAX(1,nrows_u); + lapack_int ldv_t = MAX(1,nrows_v); + lapack_complex_double* a_t = NULL; + lapack_complex_double* u_t = NULL; + lapack_complex_double* v_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info ); + return info; + } + if( ldu < ncols_u ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info ); + return info; + } + if( ldv < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lcwork == -1 ) { + LAPACK_zgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t, + s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork, + cwork, &lcwork, rwork, &lrwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) { + u_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldu_t * MAX(1,ncols_u) ); + if( u_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + } + if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) { + v_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t, + s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork, + cwork, &lcwork, rwork, &lrwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) { + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t, + u, ldu ); + } + if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) { + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v, + ldv ); + } + /* Release memory and exit */ + if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) { + LAPACKE_free( v_t ); + } +exit_level_2: + if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) { + LAPACKE_free( u_t ); + } +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zggesx.c b/lapack-netlib/LAPACKE/src/lapacke_zggesx.c index 6b4d27045..53e086753 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zggesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zggesx.c @@ -91,7 +91,7 @@ lapack_int LAPACKE_zggesx( int matrix_layout, char jobvsl, char jobvsr, if( info != 0 ) { goto exit_level_2; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhbevd.c b/lapack-netlib/LAPACKE/src/lapacke_zhbevd.c index 
95c6d3a54..ac9467496 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhbevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhbevd.c @@ -67,7 +67,7 @@ lapack_int LAPACKE_zhbevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhbevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zhbevd_2stage.c index eca867b28..9b6005b2d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhbevd_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhbevd_2stage.c @@ -67,7 +67,7 @@ lapack_int LAPACKE_zhbevd_2stage( int matrix_layout, char jobz, char uplo, lapac if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c index 91bfc0a73..76c3bac3a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c @@ -71,7 +71,7 @@ lapack_int LAPACKE_zhbgvd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c index 32b4a76f0..ce278b272 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_zheev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork, &info ); @@ -78,7 +78,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd.c index 4b1afb95c..1305ebfb3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd.c @@ -53,7 +53,7 @@ lapack_int LAPACKE_zheevd( int matrix_layout, char jobz, char uplo, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } @@ -65,7 +65,7 @@ lapack_int LAPACKE_zheevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage.c index 
9016da54c..63f139435 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage.c @@ -53,7 +53,7 @@ lapack_int LAPACKE_zheevd_2stage( int matrix_layout, char jobz, char uplo, lapac #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } @@ -65,7 +65,7 @@ lapack_int LAPACKE_zheevd_2stage( int matrix_layout, char jobz, char uplo, lapac if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c index d4b648ee1..bf2e2c828 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c @@ -71,7 +71,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_zheevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork, &lrwork, iwork, &liwork, &info ); @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c index 9672e6a22..f09cfe49d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c @@ -71,7 +71,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, goto exit_level_0; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_layout, n, n, a, lda, a_t, lda_t ); + LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ LAPACK_zheevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork, &lrwork, iwork, &liwork, &info ); @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevr.c b/lapack-netlib/LAPACKE/src/lapacke_zheevr.c index 52e7a5bee..0d26dc2f9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevr.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_zheevr( int matrix_layout, char jobz, char range, char uplo, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevr_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zheevr_2stage.c index 
faf949aef..6fa69c44b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevr_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevr_2stage.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_zheevr_2stage( int matrix_layout, char jobz, char range, char if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c index aa2d84d84..8c4a5c374 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, const lapack_complex_double* b, + lapack_int lda, lapack_complex_double* b, lapack_int ldb ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c index f77894204..62fce1f27 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, const lapack_complex_double* b, + lapack_int lda, lapack_complex_double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c index 81c3d29b4..1242a0eda 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_zhegvd( int matrix_layout, lapack_int itype, char jobz, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhpevd.c b/lapack-netlib/LAPACKE/src/lapacke_zhpevd.c index 948bb9c10..a470ca3bb 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhpevd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhpevd.c @@ -66,7 +66,7 @@ lapack_int LAPACKE_zhpevd( int matrix_layout, char jobz, char uplo, lapack_int n if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c index be18d3313..91fa26443 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c @@ -70,7 +70,7 @@ lapack_int LAPACKE_zhpgvd( int matrix_layout, lapack_int itype, char jobz, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c index 0d8bcf550..e62f8a4e3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c @@ -43,12 +43,10 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust 
info */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; + double* work_lapack = NULL; /* Check leading dimension(s) */ if( lda < n ) { info = -8; @@ -62,12 +60,23 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + } /* Transpose input matrices */ LAPACKE_ztr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work ); - info = 0; /* LAPACK call is ok! */ + res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_1: LAPACKE_free( a_t ); exit_level_0: if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zstedc.c b/lapack-netlib/LAPACKE/src/lapacke_zstedc.c index 1bd7274c1..665c4414f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zstedc.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zstedc.c @@ -74,7 +74,7 @@ lapack_int LAPACKE_zstedc( int matrix_layout, char compz, lapack_int n, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lrwork = (lapack_int)rwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zstegr.c b/lapack-netlib/LAPACKE/src/lapacke_zstegr.c index 2a65dcc4d..07b5ce81d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zstegr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zstegr.c @@ -82,7 +82,7 @@ lapack_int LAPACKE_zstegr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zstemr.c b/lapack-netlib/LAPACKE/src/lapacke_zstemr.c index c1144488e..d1d1d5692 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zstemr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zstemr.c @@ -75,7 +75,7 @@ lapack_int LAPACKE_zstemr( int matrix_layout, char jobz, char range, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c index 3c85f9796..7442702aa 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, const lapack_complex_double* a, + lapack_int nrhs, lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c index cdc97fa02..ec05ce6d5 
100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, - const lapack_complex_double* a, lapack_int lda, + lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c b/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c index 60f48ba8f..f6f58becd 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c @@ -84,7 +84,7 @@ lapack_int LAPACKE_ztgsen( int matrix_layout, lapack_int ijob, if( info != 0 ) { goto exit_level_0; } - liwork = (lapack_int)iwork_query; + liwork = iwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ if( ijob != 0 ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztprfb.c b/lapack-netlib/LAPACKE/src/lapacke_ztprfb.c index fce801762..7a791c0d4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztprfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztprfb.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_ztprfb( int matrix_layout, char side, char trans, char direct lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb) { - lapack_int ncols_v, nrows_v; + lapack_int ncols_v, nrows_v, ncols_a, nrows_a; lapack_int info = 0; lapack_int ldwork; lapack_int work_size; @@ -52,20 +52,33 @@ lapack_int LAPACKE_ztprfb( int matrix_layout, char side, char trans, char direct } #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { - /* Optionally check input matrices for NaNs */ + /* Optionally check input matrices for NaNs + * V is m-by-k (left, columnwise) + * or n-by-k (right, columnwise) + * or k-by-m (left, rowwise) + * or k-by-n (right, rowwise) + * T is k-by-k + * A is k-by-n (left) + * or m-by-k (right) + * B is m-by-n + */ if( LAPACKE_lsame( storev, 'C' ) ) { ncols_v = k; nrows_v = LAPACKE_lsame( side, 'L' ) ? m : - ( LAPACKE_lsame( side, 'R' ) ? n : 0 ); + LAPACKE_lsame( side, 'R' ) ? n : 0; } else if( LAPACKE_lsame( storev, 'R' ) ) { ncols_v = LAPACKE_lsame( side, 'L' ) ? m : - ( LAPACKE_lsame( side, 'R' ) ? n : 0 ); + LAPACKE_lsame( side, 'R' ) ? n : 0; nrows_v = k; } else { ncols_v = 0; nrows_v = 0; } - if( LAPACKE_zge_nancheck( matrix_layout, k, m, a, lda ) ) { + nrows_a = LAPACKE_lsame( side, 'L' ) ? k : + LAPACKE_lsame( side, 'R' ) ? m : 0; + ncols_a = LAPACKE_lsame( side, 'L' ) ? n : + LAPACKE_lsame( side, 'R' ) ? 
k : 0; + if( LAPACKE_zge_nancheck( matrix_layout, ncols_a, nrows_a, a, lda ) ) { return -14; } if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) { @@ -80,17 +93,16 @@ lapack_int LAPACKE_ztprfb( int matrix_layout, char side, char trans, char direct } #endif if (side=='l' || side=='L') { - ldwork = k; - work_size = MAX(1,ldwork) * MAX(1,n); - } + ldwork = k; + work_size = MAX(1,ldwork) * MAX(1,n); + } else { - ldwork = m; - work_size = MAX(1,ldwork) * MAX(1,k); - } - + ldwork = m; + work_size = MAX(1,ldwork) * MAX(1,k); + } /* Allocate memory for working array(s) */ work = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * work_size ); + LAPACKE_malloc( sizeof(lapack_complex_double) * work_size ); if( work == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunmhr.c b/lapack-netlib/LAPACKE/src/lapacke_zunmhr.c index 357d71184..61ed6f6f2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zunmhr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zunmhr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmhr( int matrix_layout, char side, char trans, if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -11; } - if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { return -10; } } diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index 1f639c6ea..648a8c141 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -32,7 +32,12 @@ ############################################################################## # makefile for LAPACKE, used to build lapacke binary. # -include ../../make.inc +TOPSRCDIR = ../.. +include $(TOPSRCDIR)/make.inc + +.SUFFIXES: .c .o +.c.o: + $(CC) $(CFLAGS) -I../include -c -o $@ $< OBJ = lapacke_cgb_nancheck.o \ lapacke_cgb_trans.o \ @@ -183,15 +188,15 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_make_complex_float.o \ lapacke_make_complex_double.o +.PHONY: all all: lib +.PHONY: lib lib: $(OBJ) - $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $^ - $(RANLIB) ../../$(LAPACKELIB) + $(AR) $(ARFLAGS) $(LAPACKELIB) $^ + $(RANLIB) $(LAPACKELIB) +.PHONY: clean cleanobj clean: cleanobj cleanobj: rm -f *.o - -.c.o: - $(CC) $(CFLAGS) -I../include -c -o $@ $< diff --git a/lapack-netlib/LAPACKE/utils/lapacke_chp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_chp_nancheck.c index 5e51e237c..0a7e6a2e2 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_chp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_chp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cpf_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_cpf_nancheck.c index a1f14fd69..5e058418e 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_cpf_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_cpf_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo, transr or * matrix_layout. 
*/ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cpp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_cpp_nancheck.c index fc00ce2df..23174d68b 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_cpp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_cpp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_csp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_csp_nancheck.c index 56d53c74b..d1a8aa290 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_csp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_csp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ctp_nancheck.c index 97d1ab083..35c48a409 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_ctp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dpf_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dpf_nancheck.c index 69c4cfdb4..df95f1318 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dpf_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dpf_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo, transr or * matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dpp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dpp_nancheck.c index 214496710..0ba66f96c 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dpp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dpp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dsp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dsp_nancheck.c index 2eada7c99..69d24611c 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dsp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dsp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. 
*/ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dtp_nancheck.c index 29666e273..43f33bdd2 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dtp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_spf_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_spf_nancheck.c index 0e5b4659f..20666c4d6 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_spf_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_spf_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo, transr or * matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_spp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_spp_nancheck.c index eae73fa5c..c1098de70 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_spp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_spp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ssp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ssp_nancheck.c index 447724b01..35ffe6522 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_ssp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_ssp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_stp_nancheck.c index 2932d4040..4dfef0200 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_stp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_stp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zhp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_zhp_nancheck.c index 694e1310e..bcf331fe1 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_zhp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_zhp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. 
*/ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zpf_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_zpf_nancheck.c index a0682290b..c510b1d1a 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_zpf_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_zpf_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo, transr or * matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zpp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_zpp_nancheck.c index 141a796aa..450878bcf 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_zpp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_zpp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zsp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_zsp_nancheck.c index d1a88641c..2d7795166 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_zsp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_zsp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ztp_nancheck.c index 8e1eec971..d3a06c381 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_ztp_nancheck.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztp_nancheck.c @@ -33,7 +33,7 @@ #include "lapacke_utils.h" /* Check a matrix for NaN entries. - * Since matrix in packed format stored continiously it just required to + * Since matrix in packed format stored continuously it just required to * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout. */ diff --git a/lapack-netlib/Makefile b/lapack-netlib/Makefile index 1d7e82c34..d5e75b69e 100644 --- a/lapack-netlib/Makefile +++ b/lapack-netlib/Makefile @@ -4,89 +4,120 @@ # April 2012 # -include make.inc +TOPSRCDIR = . +include $(TOPSRCDIR)/make.inc +.PHONY: all all: lapack_install lib blas_testing lapack_testing +.PHONY: lib lib: lapacklib tmglib #lib: blaslib variants lapacklib tmglib +.PHONY: blaslib blaslib: $(MAKE) -C BLAS +.PHONY: cblaslib cblaslib: $(MAKE) -C CBLAS +.PHONY: lapacklib lapacklib: $(MAKE) -C SRC +.PHONY: lapackelib lapackelib: $(MAKE) -C LAPACKE +.PHONY: blaspplib +blaspplib: + @echo "Thank you for your interest in BLAS++, a newly developed C++ API for the BLAS library" + @echo "The objective of BLAS++ is to provide a convenient, performance-oriented API for development in the C++ language that, for the most part, preserves established conventions while taking advantage of modern C++ features such as namespaces, templates, and exceptions." + @echo "We are still working on integrating BLAS++ in our library.
For BLAS++ related support questions, please email: slate-user@icl.utk.edu" + +.PHONY: lapackpplib +lapackpplib: + @echo "Thank you for your interest in LAPACK++, a newly developed C++ API for the LAPACK library" + @echo "The objective of LAPACK++ is to provide a convenient, performance-oriented API for development in the C++ language that, for the most part, preserves established conventions while taking advantage of modern C++ features such as namespaces, templates, and exceptions." + @echo "We are still working on integrating LAPACK++ in our library. For the moment, you can download LAPACK++ directly from https://bitbucket.org/icl/lapackpp" + @echo "For LAPACK++ related support questions, please email: slate-user@icl.utk.edu" + +.PHONY: tmglib tmglib: $(MAKE) -C TESTING/MATGEN +.PHONY: variants variants: $(MAKE) -C SRC/VARIANTS +.PHONY: lapack_install lapack_install: $(MAKE) -C INSTALL run +.PHONY: blas_testing blas_testing: blaslib $(MAKE) -C BLAS blas_testing +.PHONY: cblas_testing cblas_testing: cblaslib blaslib $(MAKE) -C CBLAS cblas_testing +.PHONY: lapack_testing lapack_testing: tmglib lapacklib blaslib $(MAKE) -C TESTING/LIN cleanexe $(MAKE) -C TESTING ./lapack_testing.py +.PHONY: variants_testing variants_testing: tmglib variants lapacklib blaslib $(MAKE) -C TESTING/LIN cleanexe - $(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/cholrl.a' + $(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/cholrl.a' $(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_cholrl.out $(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_cholrl.out $(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_cholrl.out $(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_cholrl.out $(MAKE) -C TESTING/LIN cleanexe - $(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/choltop.a' + $(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/choltop.a' $(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_choltop.out $(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_choltop.out $(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_choltop.out $(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_choltop.out $(MAKE) -C TESTING/LIN cleanexe - $(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/lucr.a' + $(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/lucr.a' $(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_lucr.out $(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_lucr.out $(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_lucr.out $(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_lucr.out $(MAKE) -C TESTING/LIN cleanexe - $(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/lull.a' + $(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/lull.a' $(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_lull.out $(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_lull.out $(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_lull.out $(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_lull.out $(MAKE) -C TESTING/LIN cleanexe - $(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/lurec.a' + $(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/lurec.a' $(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_lurec.out $(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_lurec.out $(MAKE) -C TESTING ctest.out
&& mv TESTING/ctest.out TESTING/ctest_lurec.out $(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_lurec.out $(MAKE) -C TESTING/LIN cleanexe - $(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/qrll.a' + $(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/qrll.a' $(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_qrll.out $(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_qrll.out $(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_qrll.out $(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_qrll.out +.PHONY: cblas_example cblas_example: cblaslib blaslib $(MAKE) -C CBLAS cblas_example +.PHONY: lapacke_example lapacke_example: lapackelib lapacklib blaslib $(MAKE) -C LAPACKE lapacke_example +.PHONY: html html: @echo "LAPACK HTML PAGES GENERATION with Doxygen" doxygen DOCS/Doxyfile @@ -96,6 +127,7 @@ html: @echo "Online version available at http://www.netlib.org/lapack/explore-html/" @echo "==================" +.PHONY: man man: @echo "LAPACK MAN PAGES GENERATION with Doxygen" doxygen DOCS/Doxyfile_man @@ -105,6 +137,7 @@ man: @echo "Usage: man dgetrf.f" @echo "==================" +.PHONY: clean cleanobj cleanlib cleanexe cleantest clean: $(MAKE) -C INSTALL clean $(MAKE) -C BLAS clean @@ -146,4 +179,4 @@ cleantest: $(MAKE) -C INSTALL cleantest $(MAKE) -C BLAS cleantest $(MAKE) -C CBLAS cleantest - $(MAKE) -C TESTING cleantest + $(MAKE) -C TESTING cleantest \ No newline at end of file diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md index e5ac2d9c8..f0aed6c18 100644 --- a/lapack-netlib/README.md +++ b/lapack-netlib/README.md @@ -3,6 +3,7 @@ [![Build Status](https://travis-ci.org/Reference-LAPACK/lapack.svg?branch=master)](https://travis-ci.org/Reference-LAPACK/lapack) [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/) [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack) +[![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions) * VERSION 1.0 : February 29, 1992 @@ -29,6 +30,7 @@ * VERSION 3.7.0 : December 2016 * VERSION 3.7.1 : June 2017 * VERSION 3.8.0 : November 2017 +* VERSION 3.9.0 : November 2019 LAPACK is a library of Fortran subroutines for solving the most commonly occurring problems in numerical linear algebra. @@ -70,6 +72,14 @@ CBLAS, a C interface to the BLAS, and (5) LAPACKE, a C interface to LAPACK. - LAPACK includes also the CMake build. You will need to have CMake installed on your machine (CMake is available at http://www.cmake.org/). CMake will allow an easy installation on a Windows Machine. + An example CMake build is: + ```sh + mkdir build + cd build + cmake -DCMAKE_INSTALL_LIBDIR=$HOME/.local/lapack .. + cmake --build . -j --target install + ``` + That installs the LAPACK library under $HOME/.local/lapack/ - Specific information to run LAPACK under Windows is available at http://icl.cs.utk.edu/lapack-for-windows/lapack/. @@ -99,7 +109,7 @@ You can also contact directly the LAPACK team at lapack@icl.utk.edu. ## Testing LAPACK includes a thorough test suite. We recommend that, after compilation, -you run the test suite. +you run the test suite. For complete information on the LAPACK Testing please consult LAPACK Working Note 41 "Installation Guide for LAPACK". @@ -115,4 +125,3 @@ LAPACK now includes the LAPACKE package.
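As a quick check that an installed copy is usable from C, a minimal program along the following lines can be compiled against it. This is a sketch only: the file name `smoke_test.c` and the build line are illustrative and depend on where the CMake example above installed the libraries; `LAPACKE_dgesv` itself is the standard LAPACKE driver.

```c
/* smoke_test.c -- hypothetical sanity check: solve the 2x2 system
 * A*x = b with LAPACKE_dgesv and print the solution (expected: 1 1).
 * Illustrative build line, adjust paths to your install prefix:
 *   cc smoke_test.c -llapacke -llapack -lblas -lgfortran -lm
 */
#include <stdio.h>
#include <lapacke.h>

int main(void) {
    double a[4] = { 4.0, 2.0,      /* row-major 2x2 matrix A */
                    2.0, 3.0 };
    double b[2] = { 6.0, 5.0 };    /* right-hand side, overwritten by x */
    lapack_int ipiv[2];
    lapack_int info = LAPACKE_dgesv( LAPACK_ROW_MAJOR, 2, 1,
                                     a, 2, ipiv, b, 1 );
    printf( "info = %d, x = (%g, %g)\n", (int)info, b[0], b[1] );
    return (int)info;
}
```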
LAPACKE is a Standard C language API for LAPACK This was born from a collaboration of the LAPACK and INTEL Math Kernel Library teams. See: http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack. - diff --git a/lapack-netlib/SRC/CMakeLists.txt b/lapack-netlib/SRC/CMakeLists.txt index 944401beb..f19bdd302 100644 --- a/lapack-netlib/SRC/CMakeLists.txt +++ b/lapack-netlib/SRC/CMakeLists.txt @@ -106,7 +106,7 @@ set(SLASRC slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slauu2.f slauum.f sopgtr.f sopmtr.f sorg2l.f sorg2r.f sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f - sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f + sorgrq.f sorgtr.f sorgtsqr.f sorm2l.f sorm2r.f sorm22.f sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f spbstf.f spbsv.f spbsvx.f @@ -148,9 +148,11 @@ set(SLASRC sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f sgelq.f slaswlq.f slamswlq.f sgemlq.f stplqt.f stplqt2.f stpmlqt.f + sorhr_col.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f - ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f) + ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f + sgesvdq.f scombssq.f) set(DSLASRC spotrs.f sgetrs.f spotrf.f sgetrf.f) @@ -233,7 +235,7 @@ set(CLASRC ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f ctrsyl.f ctrti2.f ctrtri.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f - cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f + cungrq.f cungtr.f cungtsqr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f @@ -247,9 +249,11 @@ set(CLASRC cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f cgelq.f claswlq.f clamswlq.f cgemlq.f ctplqt.f ctplqt2.f ctpmlqt.f + cunhr_col.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f - chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f) + chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f + cgesvdq.f) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -295,7 +299,7 @@ set(DLASRC dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlauu2.f dlauum.f dopgtr.f dopmtr.f dorg2l.f dorg2r.f dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f - dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f + dorgrq.f dorgtr.f dorgtsqr.f dorm2l.f dorm2r.f dorm22.f dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f dpbstf.f dpbsv.f dpbsvx.f @@ -339,9 +343,11 @@ set(DLASRC dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f dgelq.f dlaswlq.f dlamswlq.f dgemlq.f dtplqt.f dtplqt2.f dtpmlqt.f + dorhr_col.f dlaorhr_col_getrfnp.f dlaorhr_col_getrfnp2.f dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f - dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f) + dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f + dgesvdq.f dcombssq.f) set(DXLASRC dgesvxx.f dgerfsx.f 
dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -424,7 +430,7 @@ set(ZLASRC ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f ztrsyl.f ztrti2.f ztrtri.f ztrtrs.f ztzrzf.f zung2l.f zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f - zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f + zungrq.f zungtr.f zungtsqr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f zunmtr.f zupgtr.f zupmtr.f izmax1.f dzsum1.f zstemr.f @@ -440,9 +446,11 @@ set(ZLASRC zgelqt.f zgelqt3.f zgemlqt.f zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f zgelq.f zlaswlq.f zlamswlq.f zgemlq.f + zunhr_col.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f - zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f) + zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f + zgesvdq.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f @@ -504,7 +512,7 @@ if(USE_XBLAS) endif() target_link_libraries(lapack PRIVATE ${BLAS_LIBRARIES}) -if (${CMAKE_BUILD_TYPE_UPPER} STREQUAL "COVERAGE") +if(_is_coverage_build) target_link_libraries(lapack PRIVATE gcov) add_coverage(lapack) endif() diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 1c276aff6..9f79e20e9 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -1,5 +1,3 @@ -include ../make.inc - ####################################################################### # This is the makefile to create a library for LAPACK. # The files are organized as follows: @@ -44,7 +42,7 @@ include ../make.inc # and is created at the next higher directory level. # # To remove the object files after the library is created, enter -# make clean +# make cleanobj # On some systems, you can force the source files to be recompiled by # entering (for example) # make single FRC=FRC @@ -56,6 +54,13 @@ include ../make.inc # ####################################################################### +TOPSRCDIR = .. 
+include $(TOPSRCDIR)/make.inc + +.SUFFIXES: .F .o +.F.o: + $(FC) $(FFLAGS) -c -o $@ $< + ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \ iparmq.o iparam2stage.o \ ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \ @@ -128,7 +133,7 @@ SLASRC_O = \ slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \ slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ - sorgrq.o sorgtr.o sorm2l.o sorm2r.o sorm22.o \ + sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \ sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ spbstf.o spbsv.o spbsvx.o \ @@ -171,9 +176,11 @@ SLASRC_O = \ sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \ sgelq.o slaswlq.o slamswlq.o sgemlq.o \ stplqt.o stplqt2.o stpmlqt.o \ + sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ - ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o + ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ + sgesvdq.o scombssq.o DSLASRC_O = spotrs.o sgetrs.o spotrf.o sgetrf.o @@ -258,7 +265,7 @@ CLASRC_O = \ ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ - cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ + cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ @@ -272,9 +279,11 @@ CLASRC_O = \ cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ cgelq.o claswlq.o clamswlq.o cgemlq.o \ ctplqt.o ctplqt2.o ctpmlqt.o \ + cunhr_col.o claunhr_col_getrfnp.o claunhr_col_getrfnp2.o \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ - chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o + chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ + cgesvdq.o ifdef USEXBLAS CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ @@ -324,7 +333,7 @@ DLASRC_O = \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ - dorgrq.o dorgtr.o dorm2l.o dorm2r.o dorm22.o \ + dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \ dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ dpbstf.o dpbsv.o dpbsvx.o \ @@ -368,9 +377,11 @@ DLASRC_O = \ dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \ dtplqt.o dtplqt2.o dtpmlqt.o \ + dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ - dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o + dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ + dgesvdq.o dcombssq.o ifdef USEXBLAS DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ @@ -456,7 +467,7 @@ ZLASRC_O = \ ztptrs.o ztrcon.o 
ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ - zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ + zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ @@ -472,9 +483,11 @@ ZLASRC_O = \ zgelqt.o zgelqt3.o zgemlqt.o \ zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \ + zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ - zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o + zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ + zgesvdq.o ifdef USEXBLAS ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ @@ -550,33 +563,29 @@ ifdef BUILD_DEPRECATED DEPRECATED = $(DEPRECSRC) endif -all: ../$(LAPACKLIB) +.PHONY: all +all: $(LAPACKLIB) -.PHONY: ../$(LAPACKLIB) - -../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) - $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) +$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ +.PHONY: single complex double complex16 single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ - $(SXLASRC) $(SCLAUX) $(ALLAUX) - $(RANLIB) ../$(LAPACKLIB) + $(AR) $(ARFLAGS) $(LAPACKLIB) $^ + $(RANLIB) $(LAPACKLIB) complex: $(CLASRC) $(ZCLASRC) $(CXLASRC) $(SCLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ - $(CXLASRC) $(SCLAUX) $(ALLAUX) - $(RANLIB) ../$(LAPACKLIB) + $(AR) $(ARFLAGS) $(LAPACKLIB) $^ + $(RANLIB) $(LAPACKLIB) double: $(DLASRC) $(DSLASRC) $(DXLASRC) $(DZLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ - $(DXLASRC) $(DZLAUX) $(ALLAUX) - $(RANLIB) ../$(LAPACKLIB) + $(AR) $(ARFLAGS) $(LAPACKLIB) $^ + $(RANLIB) $(LAPACKLIB) complex16: $(ZLASRC) $(ZCLASRC) $(ZXLASRC) $(DZLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ - $(ZXLASRC) $(DZLAUX) $(ALLAUX) - $(RANLIB) ../$(LAPACKLIB) + $(AR) $(ARFLAGS) $(LAPACKLIB) $^ + $(RANLIB) $(LAPACKLIB) $(ALLAUX): $(FRC) $(SCLAUX): $(FRC) @@ -597,18 +606,16 @@ endif FRC: @FRC=$(FRC) -clean: +.PHONY: clean cleanobj cleanlib +clean: cleanobj cleanlib +cleanobj: rm -f *.o DEPRECATED/*.o - -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< - -.F.o: - $(FORTRAN) $(OPTS) -c $< -o $@ - -slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< -dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< -sla_wwaddw.o: sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< -dla_wwaddw.o: dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< -cla_wwaddw.o: cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< -zla_wwaddw.o: zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< +cleanlib: + rm -f $(LAPACKLIB) + +slaruv.o: slaruv.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +dlaruv.o: dlaruv.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +sla_wwaddw.o: sla_wwaddw.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +dla_wwaddw.o: dla_wwaddw.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +cla_wwaddw.o: cla_wwaddw.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +zla_wwaddw.o: zla_wwaddw.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 
9f1410755..25d8ee175 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -1,5 +1,3 @@ -include ../../make.inc - ####################################################################### # This is the makefile to create a the variants libraries for LAPACK. # The files are organized as follows: @@ -17,6 +15,9 @@ include ../../make.inc # 1065-1081. http://dx.doi.org/10.1137/S0895479896297744 ####################################################################### +TOPSRCDIR = ../.. +include $(TOPSRCDIR)/make.inc + CHOLRL = cholesky/RL/cpotrf.o cholesky/RL/dpotrf.o cholesky/RL/spotrf.o cholesky/RL/zpotrf.o CHOLTOP = cholesky/TOP/cpotrf.o cholesky/TOP/dpotrf.o cholesky/TOP/spotrf.o cholesky/TOP/zpotrf.o @@ -30,37 +31,36 @@ LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o +.PHONY: all all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a cholrl.a: $(CHOLRL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ choltop.a: $(CHOLTOP) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lucr.a: $(LUCR) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lull.a: $(LULL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lurec.a: $(LUREC) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ qrll.a: $(QRLL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ +.PHONY: clean cleanobj cleanlib clean: cleanobj cleanlib cleanobj: rm -f $(CHOLRL) $(CHOLTOP) $(LUCR) $(LULL) $(LUREC) $(QRLL) cleanlib: rm -f *.a - -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< diff --git a/lapack-netlib/SRC/VARIANTS/README b/lapack-netlib/SRC/VARIANTS/README index 4d301cc6e..ef7626deb 100644 --- a/lapack-netlib/SRC/VARIANTS/README +++ b/lapack-netlib/SRC/VARIANTS/README @@ -34,7 +34,7 @@ References:For a more detailed description please refer to ========= These variants are compiled by default in the build process but they are not tested by default. -The build process creates one new library per variants in the four arithmetics (single real/double real/single complex/double complex). +The build process creates one new library per variant in the four arithmetics (single real/double real/single complex/double complex). The libraries are in the SRC/VARIANTS directory. Corresponding libraries created in SRC/VARIANTS: @@ -64,16 +64,16 @@ You should then see the following files in the TESTING directory: = LINKING YOUR PROGRAM = ======================== -You just need to add the variants methods library in your linking sequence before your lapack libary. +You just need to add the variants methods library in your linking sequence before your lapack library.
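The calling program itself is identical in every case; only the link order shown in the examples that follow decides which implementation is pulled in. A hedged C sketch of such a caller (the file name `myprog.c` is hypothetical, and the trailing-underscore symbol name and 32-bit `INTEGER` are assumptions about the usual Fortran name-mangling and a default build):

```c
/* myprog.c -- hypothetical caller: which DGETRF variant actually runs
 * is decided purely by the link line, never by this source file. */
#include <stdio.h>

/* LAPACK's Fortran DGETRF, assuming default 32-bit INTEGERs. */
extern void dgetrf_(int *m, int *n, double *a, int *lda,
                    int *ipiv, int *info);

int main(void) {
    int n = 2, lda = 2, ipiv[2], info;
    double a[4] = { 4.0, 2.0, 2.0, 3.0 };  /* column-major 2x2 */
    dgetrf_(&n, &n, a, &lda, ipiv, &info);
    printf("dgetrf info = %d\n", info);
    return info;
}
```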
Here is a quick example for LU Default using LU Right Looking version: - $(FORTRAN) -c myprog.f - $(FORTRAN) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) -c myprog.f + $(FC) $(FFLAGS) $(LDFLAGS) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB) Using LU Left Looking version: - $(FORTRAN) -c myprog.f - $(FORTRAN) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS)/lull.a $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) -c myprog.f + $(FC) $(FFLAGS) $(LDFLAGS) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS)/lull.a $(LAPACKLIB) $(BLASLIB) =========== = SUPPORT = diff --git a/lapack-netlib/SRC/cgbrfsx.f b/lapack-netlib/SRC/cgbrfsx.f index 041b6a1b6..c23608afb 100644 --- a/lapack-netlib/SRC/cgbrfsx.f +++ b/lapack-netlib/SRC/cgbrfsx.f @@ -75,7 +75,7 @@ *> Specifies the form of the system of equations: *> = 'N': A * X = B (No transpose) *> = 'T': A**T * X = B (Transpose) -*> = 'C': A**H * X = B (Conjugate transpose = Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) *> \endverbatim *> *> \param[in] EQUED @@ -308,7 +308,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -344,14 +344,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -359,9 +359,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/cgbsvxx.f b/lapack-netlib/SRC/cgbsvxx.f index 2e113f99c..9f2bbbc1c 100644 --- a/lapack-netlib/SRC/cgbsvxx.f +++ b/lapack-netlib/SRC/cgbsvxx.f @@ -431,7 +431,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -467,14 +467,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. 
If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -482,9 +482,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/cgebak.f b/lapack-netlib/SRC/cgebak.f index 63c73bfa7..9b6402622 100644 --- a/lapack-netlib/SRC/cgebak.f +++ b/lapack-netlib/SRC/cgebak.f @@ -48,10 +48,10 @@ *> \verbatim *> JOB is CHARACTER*1 *> Specifies the type of backward transformation required: -*> = 'N', do nothing, return immediately; -*> = 'P', do backward transformation for permutation only; -*> = 'S', do backward transformation for scaling only; -*> = 'B', do backward transformations for both permutation and +*> = 'N': do nothing, return immediately; +*> = 'P': do backward transformation for permutation only; +*> = 'S': do backward transformation for scaling only; +*> = 'B': do backward transformations for both permutation and *> scaling. *> JOB must be the same as the argument JOB supplied to CGEBAL. *> \endverbatim diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f index bdd75e4f1..f07d9b755 100644 --- a/lapack-netlib/SRC/cgeev.f +++ b/lapack-netlib/SRC/cgeev.f @@ -157,7 +157,7 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, the QR algorithm failed to compute all the *> eigenvalues, and no eigenvectors have been computed; -*> elements and i+1:N of W contain eigenvalues which have +*> elements i+1:N of W contain eigenvalues which have *> converged. *> \endverbatim * diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index a7b1c451c..350da4c40 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -80,13 +80,13 @@ *> desirable, then this option is advisable. The input matrix A *> is preprocessed with QR factorization with FULL (row and *> column) pivoting. -*> = 'G' Computation as with 'F' with an additional estimate of the +*> = 'G': Computation as with 'F' with an additional estimate of the *> condition number of B, where A=B*D. If A has heavily weighted *> rows, then using this condition number gives too pessimistic *> error bound. *> = 'A': Small singular values are not well determined by the data *> and are considered as noisy; the matrix is treated as -*> numerically rank defficient. The error in the computed +*> numerically rank deficient. The error in the computed *> singular values is bounded by f(m,n)*epsilon*||A||. *> The computed SVD A = U * S * V^* restores A up to *> f(m,n)*epsilon*||A||. @@ -117,7 +117,7 @@ *> = 'V': N columns of V are returned in the array V; Jacobi rotations *> are not explicitly accumulated. 
*> = 'J': N columns of V are returned in the array V, but they are -*> computed as the product of Jacobi rotations, if JOBT .EQ. 'N'. +*> computed as the product of Jacobi rotations, if JOBT = 'N'. *> = 'W': V may be used as workspace of length N*N. See the description *> of V. *> = 'N': V is not computed. @@ -131,7 +131,7 @@ *> specified range. If A .NE. 0 is scaled so that the largest singular *> value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues *> the licence to kill columns of A whose norm in c*A is less than -*> SQRT(SFMIN) (for JOBR.EQ.'R'), or less than SMALL=SFMIN/EPSLN, +*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, *> where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E'). *> = 'N': Do not kill small columns of c*A. This option assumes that *> BLAS and QR factorizations and triangular solvers are @@ -229,7 +229,7 @@ *> If JOBU = 'F', then U contains on exit the M-by-M matrix of *> the left singular vectors, including an ONB *> of the orthogonal complement of the Range(A). -*> If JOBU = 'W' .AND. (JOBV.EQ.'V' .AND. JOBT.EQ.'T' .AND. M.EQ.N), +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), *> then U is used as workspace if the procedure *> replaces A with A^*. In that case, [V] is computed *> in U as left singular vectors of A^* and then @@ -251,7 +251,7 @@ *> V is COMPLEX array, dimension ( LDV, N ) *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; -*> If JOBV = 'W', AND (JOBU.EQ.'U' AND JOBT.EQ.'T' AND M.EQ.N), +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), *> then V is used as workspace if the pprocedure *> replaces A with A^*. In that case, [U] is computed *> in V as right singular vectors of A^* and then @@ -282,7 +282,7 @@ *> Length of CWORK to confirm proper allocation of workspace. *> LWORK depends on the job: *> -*> 1. If only SIGMA is needed ( JOBU.EQ.'N', JOBV.EQ.'N' ) and +*> 1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and *> 1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'): *> LWORK >= 2*N+1. This is the minimal requirement. *> ->> For optimal performance (blocked code) the optimal value @@ -298,9 +298,9 @@ *> In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ), *> N*N+LWORK(CPOCON)). -*> 2. If SIGMA and the right singular vectors are needed (JOBV.EQ.'V'), -*> (JOBU.EQ.'N') -*> 2.1 .. no scaled condition estimate requested (JOBE.EQ.'N'): +*> 2. If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> (JOBU = 'N') +*> 2.1 .. no scaled condition estimate requested (JOBE = 'N'): *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance, *> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, @@ -318,10 +318,10 @@ *> LWORK >= max(N+LWORK(CGEQP3), LWORK(CPOCON), N+LWORK(CGESVJ), *> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). *> 3. If SIGMA and the left singular vectors are needed -*> 3.1 .. no scaled condition estimate requested (JOBE.EQ.'N'): +*> 3.1 .. no scaled condition estimate requested (JOBE = 'N'): *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance: -*> if JOBU.EQ.'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, *> where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR. *> In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(CGEQP3), 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). 
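In practice, callers rarely assemble these bounds by hand: LAPACK routines, CGEJSV included (see the LWORK = -1 / LRWORK = -1 note in the IWORK description), report their own optimal workspace when queried. A hedged C sketch of that query idiom follows, shown with CGEQRF because its interface is compact; the trailing-underscore symbol name and 32-bit `INTEGER` are assumptions about the build.

```c
/* lwork_query.c -- sketch of the standard LAPACK workspace query:
 * call once with LWORK = -1, read the optimal size from WORK(1),
 * allocate, then call again for the real computation. */
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>

extern void cgeqrf_(int *m, int *n, float _Complex *a, int *lda,
                    float _Complex *tau, float _Complex *work,
                    int *lwork, int *info);

int main(void) {
    int m = 100, n = 50, lda = m, lwork = -1, info;
    float _Complex *a   = malloc((size_t)lda * n * sizeof *a);
    float _Complex *tau = malloc((size_t)n * sizeof *tau);
    float _Complex query;
    for (int i = 0; i < lda * n; ++i) a[i] = (float)(i % 7 + 1);

    /* Query call: LWORK = -1 returns the optimal size in query. */
    cgeqrf_(&m, &n, a, &lda, tau, &query, &lwork, &info);
    lwork = (int)crealf(query);

    float _Complex *work = malloc((size_t)lwork * sizeof *work);
    cgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);  /* real run */
    printf("optimal lwork = %d, info = %d\n", lwork, info);
    free(a); free(tau); free(work);
    return info;
}
```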
@@ -329,16 +329,16 @@ *> required (JOBA='E', or 'G'). *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance: -*> if JOBU.EQ.'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, *> where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR. *> In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CPOCON), *> 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). *> -*> 4. If the full SVD is needed: (JOBU.EQ.'U' or JOBU.EQ.'F') and -*> 4.1. if JOBV.EQ.'V' +*> 4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> 4.1. if JOBV = 'V' *> the minimal requirement is LWORK >= 5*N+2*N*N. -*> 4.2. if JOBV.EQ.'J' the minimal requirement is +*> 4.2. if JOBV = 'J' the minimal requirement is *> LWORK >= 4*N+N*N. *> In both cases, the allocated CWORK can accommodate blocked runs *> of CGEQP3, CGEQRF, CGELQF, CUNMQR, CUNMLQ. @@ -357,7 +357,7 @@ *> of A. (See the description of SVA().) *> RWORK(2) = See the description of RWORK(1). *> RWORK(3) = SCONDA is an estimate for the condition number of -*> column equilibrated A. (If JOBA .EQ. 'E' or 'G') +*> column equilibrated A. (If JOBA = 'E' or 'G') *> SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). *> It is computed using SPOCON. It holds *> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA @@ -376,7 +376,7 @@ *> triangular factor in the first QR factorization. *> RWORK(5) = an estimate of the scaled condition number of the *> triangular factor in the second QR factorization. -*> The following two parameters are computed if JOBT .EQ. 'T'. +*> The following two parameters are computed if JOBT = 'T'. *> They are provided for a developer/implementer who is familiar *> with the details of the method. *> RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy @@ -457,23 +457,23 @@ *> of JOBA and JOBR. *> IWORK(2) = the number of the computed nonzero singular values *> IWORK(3) = if nonzero, a warning message: -*> If IWORK(3).EQ.1 then some of the column norms of A +*> If IWORK(3) = 1 then some of the column norms of A *> were denormalized floats. The requested high accuracy *> is not warranted by the data. -*> IWORK(4) = 1 or -1. If IWORK(4) .EQ. 1, then the procedure used A^* to +*> IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to *> do the job as specified by the JOB parameters. -*> If the call to CGEJSV is a workspace query (indicated by LWORK .EQ. -1 and -*> LRWORK .EQ. -1), then on exit IWORK(1) contains the required length of +*> If the call to CGEJSV is a workspace query (indicated by LWORK = -1 and +*> LRWORK = -1), then on exit IWORK(1) contains the required length of *> IWORK for the job parameters used in the call. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> < 0 : if INFO = -i, then the i-th argument had an illegal value. -*> = 0 : successful exit; -*> > 0 : CGEJSV did not converge in the maximal allowed number -*> of sweeps. The computed values may be inaccurate. +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: CGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. *> \endverbatim * * Authors: @@ -1336,7 +1336,7 @@ IF ( L2ABER ) THEN * Standard absolute error bound suffices. All sigma_i with * sigma_i < N*EPSLN*||A|| are flushed to zero. 
This is an -* agressive enforcement of lower numerical rank by introducing a +* aggressive enforcement of lower numerical rank by introducing a * backward error of the order of N*EPSLN*||A||. TEMP1 = SQRT(REAL(N))*EPSLN DO 3001 p = 2, N @@ -1348,9 +1348,9 @@ 3001 CONTINUE 3002 CONTINUE ELSE IF ( L2RANK ) THEN -* .. similarly as above, only slightly more gentle (less agressive). +* .. similarly as above, only slightly more gentle (less aggressive). * Sudden drop on the diagonal of R1 is used as the criterion for -* close-to-rank-defficient. +* close-to-rank-deficient. TEMP1 = SQRT(SFMIN) DO 3401 p = 2, N IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. @@ -1718,7 +1718,7 @@ CALL CPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1, $ CWORK(2*N+NR*NR+1),RWORK,IERR) CONDR1 = ONE / SQRT(TEMP1) -* .. here need a second oppinion on the condition number +* .. here need a second opinion on the condition number * .. then assume worst case scenario * R1 is OK for inverse <=> CONDR1 .LT. REAL(N) * more conservative <=> CONDR1 .LT. SQRT(REAL(N)) @@ -1763,7 +1763,7 @@ ELSE * * .. ill-conditioned case: second QRF with pivoting -* Note that windowed pivoting would be equaly good +* Note that windowed pivoting would be equally good * numerically, and more run-time efficient. So, in * an optimal implementation, the next call to CGEQP3 * should be replaced with eg. CALL CGEQPX (ACM TOMS #782) @@ -1821,7 +1821,7 @@ * IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 -* (this overwrittes the copy of R2, as it will not be +* (this overwrites the copy of R2, as it will not be * needed in this branch, but it does not overwritte the * Huseholder vectors of Q2.). CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) @@ -2077,7 +2077,7 @@ * * This branch deploys a preconditioned Jacobi SVD with explicitly * accumulated rotations. It is included as optional, mainly for -* experimental purposes. It does perfom well, and can also be used. +* experimental purposes. It does perform well, and can also be used. * In this implementation, this branch will be automatically activated * if the condition number sigma_max(A) / sigma_min(A) is predicted * to be greater than the overflow threshold. This is because the diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index 909162ebc..c3b2238bf 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -1,3 +1,4 @@ +*> \brief \b CGELQ * * Definition: * =========== @@ -17,7 +18,17 @@ * ============= *> *> \verbatim -*> CGELQ computes a LQ factorization of an M-by-N matrix A. +*> +*> CGELQ computes an LQ factorization of a complex M-by-N matrix A: +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a N-by-N orthogonal matrix; +*> L is an lower-triangular M-by-M matrix; +*> 0 is a M-by-(N-M) zero matrix, if M < N. +*> *> \endverbatim * * Arguments: @@ -138,7 +149,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> @@ -159,10 +170,10 @@ SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. 
of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, TSIZE, LWORK diff --git a/lapack-netlib/SRC/cgelq2.f b/lapack-netlib/SRC/cgelq2.f index 9742d359b..3fab2c396 100644 --- a/lapack-netlib/SRC/cgelq2.f +++ b/lapack-netlib/SRC/cgelq2.f @@ -33,8 +33,16 @@ *> *> \verbatim *> -*> CGELQ2 computes an LQ factorization of a complex m by n matrix A: -*> A = L * Q. +*> CGELQ2 computes an LQ factorization of a complex m-by-n matrix A: +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a n-by-n orthogonal matrix; +*> L is an lower-triangular m-by-m matrix; +*> 0 is a m-by-(n-m) zero matrix, if m < n. +*> *> \endverbatim * * Arguments: @@ -96,7 +104,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complexGEcomputational * @@ -121,10 +129,10 @@ * ===================================================================== SUBROUTINE CGELQ2( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f index 216630e88..030ac0b4d 100644 --- a/lapack-netlib/SRC/cgelqf.f +++ b/lapack-netlib/SRC/cgelqf.f @@ -34,7 +34,15 @@ *> \verbatim *> *> CGELQF computes an LQ factorization of a complex M-by-N matrix A: -*> A = L * Q. +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a N-by-N orthogonal matrix; +*> L is an lower-triangular M-by-M matrix; +*> 0 is a M-by-(N-M) zero matrix, if M < N. +*> *> \endverbatim * * Arguments: @@ -110,7 +118,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complexGEcomputational * @@ -135,10 +143,10 @@ * ===================================================================== SUBROUTINE CGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/cgelqt.f b/lapack-netlib/SRC/cgelqt.f index e151f10fe..f40db0b02 100644 --- a/lapack-netlib/SRC/cgelqt.f +++ b/lapack-netlib/SRC/cgelqt.f @@ -1,3 +1,4 @@ +*> \brief \b CGELQT * * Definition: * =========== diff --git a/lapack-netlib/SRC/cgelqt3.f b/lapack-netlib/SRC/cgelqt3.f index f64379722..80a9a9fc7 100644 --- a/lapack-netlib/SRC/cgelqt3.f +++ b/lapack-netlib/SRC/cgelqt3.f @@ -1,3 +1,5 @@ +*> \brief \b CGELQT3 +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f index 2f44e7cfb..4e374077e 100644 --- a/lapack-netlib/SRC/cgemlq.f +++ b/lapack-netlib/SRC/cgemlq.f @@ -1,3 +1,4 @@ +*> \brief \b CGEMLQ * * Definition: * =========== @@ -143,7 +144,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. 
These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> diff --git a/lapack-netlib/SRC/cgemlqt.f b/lapack-netlib/SRC/cgemlqt.f index e35e421b1..66b186bff 100644 --- a/lapack-netlib/SRC/cgemlqt.f +++ b/lapack-netlib/SRC/cgemlqt.f @@ -1,3 +1,5 @@ +*> \brief \b CGEMLQT +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f index a43d7be5b..54ab7aa74 100644 --- a/lapack-netlib/SRC/cgemqr.f +++ b/lapack-netlib/SRC/cgemqr.f @@ -1,3 +1,4 @@ +*> \brief \b CGEMQR * * Definition: * =========== @@ -144,7 +145,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f index a00ef45c0..e0aea88b1 100644 --- a/lapack-netlib/SRC/cgeqr.f +++ b/lapack-netlib/SRC/cgeqr.f @@ -1,3 +1,4 @@ +*> \brief \b CGEQR * * Definition: * =========== @@ -17,7 +18,18 @@ * ============= *> *> \verbatim -*> CGEQR computes a QR factorization of an M-by-N matrix A. +*> +*> CGEQR computes a QR factorization of a complex M-by-N matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -138,7 +150,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> @@ -160,10 +172,10 @@ SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, TSIZE, LWORK diff --git a/lapack-netlib/SRC/cgeqr2.f b/lapack-netlib/SRC/cgeqr2.f index 1b2030b47..8cb2fa119 100644 --- a/lapack-netlib/SRC/cgeqr2.f +++ b/lapack-netlib/SRC/cgeqr2.f @@ -33,8 +33,17 @@ *> *> \verbatim *> -*> CGEQR2 computes a QR factorization of a complex m by n matrix A: -*> A = Q * R. +*> CGEQR2 computes a QR factorization of a complex m-by-n matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a m-by-m orthogonal matrix; +*> R is an upper-triangular n-by-n matrix; +*> 0 is a (m-n)-by-n zero matrix, if m > n. +*> *> \endverbatim * * Arguments: @@ -96,7 +105,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
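The hunks above replace the terse "A = L * Q" / "A = Q * R" one-liners with the blocked forms A = ( L 0 ) * Q and A = Q * ( R ; 0 ). As a companion to the updated documentation, a minimal C sketch of both factorizations through the LAPACKE interface; the header name and the column-major layout are assumptions about the caller's build, not part of this patch:

    #include <lapacke.h>

    /* Factor a column-major m-by-n matrix in place as A = ( L 0 ) * Q.
     * On exit the lower triangle of A(1:m,1:m) holds L; the strict upper
     * part, together with tau, encodes Q as Householder reflectors. */
    lapack_int lq_factor(lapack_complex_float *a, lapack_int m, lapack_int n,
                         lapack_complex_float *tau /* length min(m,n) */)
    {
        return LAPACKE_cgelqf(LAPACK_COL_MAJOR, m, n, a, m, tau);
    }

    /* The dual factorization A = Q * ( R ; 0 ): on exit R sits in the
     * upper triangle of A(1:min(m,n),1:n). */
    lapack_int qr_factor(lapack_complex_float *a, lapack_int m, lapack_int n,
                         lapack_complex_float *tau /* length min(m,n) */)
    {
        return LAPACKE_cgeqrf(LAPACK_COL_MAJOR, m, n, a, m, tau);
    }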
* -*> \date December 2016 +*> \date November 2019 * *> \ingroup complexGEcomputational * @@ -121,10 +130,10 @@ * ===================================================================== SUBROUTINE CGEQR2( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/cgeqr2p.f b/lapack-netlib/SRC/cgeqr2p.f index 3c64255d9..1e7b980df 100644 --- a/lapack-netlib/SRC/cgeqr2p.f +++ b/lapack-netlib/SRC/cgeqr2p.f @@ -33,8 +33,18 @@ *> *> \verbatim *> -*> CGEQR2P computes a QR factorization of a complex m by n matrix A: -*> A = Q * R. The diagonal entries of R are real and nonnegative. +*> CGEQR2P computes a QR factorization of a complex m-by-n matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a m-by-m orthogonal matrix; +*> R is an upper-triangular n-by-n matrix with nonnegative diagonal +*> entries; +*> 0 is a (m-n)-by-n zero matrix, if m > n. +*> *> \endverbatim * * Arguments: @@ -97,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complexGEcomputational * @@ -124,10 +134,10 @@ * ===================================================================== SUBROUTINE CGEQR2P( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/cgeqrf.f b/lapack-netlib/SRC/cgeqrf.f index 833384707..ff0c53f2f 100644 --- a/lapack-netlib/SRC/cgeqrf.f +++ b/lapack-netlib/SRC/cgeqrf.f @@ -34,7 +34,16 @@ *> \verbatim *> *> CGEQRF computes a QR factorization of a complex M-by-N matrix A: -*> A = Q * R. +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -111,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complexGEcomputational * @@ -136,10 +145,10 @@ * ===================================================================== SUBROUTINE CGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f index a56508b4e..9c29ac90b 100644 --- a/lapack-netlib/SRC/cgeqrfp.f +++ b/lapack-netlib/SRC/cgeqrfp.f @@ -33,8 +33,18 @@ *> *> \verbatim *> -*> CGEQRFP computes a QR factorization of a complex M-by-N matrix A: -*> A = Q * R. The diagonal entries of R are real and nonnegative. 
+*> CGEQRFP computes a QR factorization of a complex M-by-N matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is an M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix with nonnegative diagonal +*> entries; +*> 0 is an (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -112,7 +122,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complexGEcomputational * @@ -139,10 +149,10 @@ * ===================================================================== SUBROUTINE CGEQRFP( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/cgerfsx.f b/lapack-netlib/SRC/cgerfsx.f index 7b72f9c9a..a6e24ae4f 100644 --- a/lapack-netlib/SRC/cgerfsx.f +++ b/lapack-netlib/SRC/cgerfsx.f @@ -74,7 +74,7 @@ *> Specifies the form of the system of equations: *> = 'N': A * X = B (No transpose) *> = 'T': A**T * X = B (Transpose) -*> = 'C': A**H * X = B (Conjugate transpose = Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) *> \endverbatim *> *> \param[in] EQUED @@ -283,7 +283,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -319,14 +319,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -334,9 +334,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/cgesc2.f b/lapack-netlib/SRC/cgesc2.f index c0b91107e..6f45a09a6 100644 --- a/lapack-netlib/SRC/cgesc2.f +++ b/lapack-netlib/SRC/cgesc2.f @@ -91,7 +91,7 @@ *> \verbatim *> SCALE is REAL *> On exit, SCALE contains the scale factor. SCALE is chosen -*> 0 <= SCALE <= 1 to prevent owerflow in the solution. +*> 0 <= SCALE <= 1 to prevent overflow in the solution.
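The CGESC2 hunk just above documents the SCALE convention used by the scaled solvers: the routine returns x and a factor 0 <= SCALE <= 1 such that A*x = SCALE*b, shrinking SCALE rather than letting components of x overflow. A sketch of how a caller might undo that scaling afterwards; this is a hypothetical helper for illustration, not library code:

    #include <complex.h>
    #include <float.h>

    /* Divide x by scale when that cannot overflow; returns 1 on
     * success, 0 if the unscaled solution is not representable. */
    int unscale_solution(float _Complex *x, int n, float scale)
    {
        if (scale >= 1.0f) return 1;          /* nothing to undo */
        float big = FLT_MAX * scale;          /* |x[i]| must stay below this */
        for (int i = 0; i < n; i++)
            if (cabsf(x[i]) > big) return 0;  /* x[i]/scale would overflow */
        for (int i = 0; i < n; i++)
            x[i] /= scale;
        return 1;
    }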
*> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cgesvdq.f b/lapack-netlib/SRC/cgesvdq.f new file mode 100644 index 000000000..77c883dde --- /dev/null +++ b/lapack-netlib/SRC/cgesvdq.f @@ -0,0 +1,1391 @@ +*> \brief CGESVDQ computes the singular value decomposition (SVD) with a QR-Preconditioned QR SVD Method for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGESVDQ + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA, +* S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK, +* CWORK, LCWORK, RWORK, LRWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* CHARACTER JOBA, JOBP, JOBR, JOBU, JOBV +* INTEGER M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LCWORK, LRWORK, +* INFO +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( * ) +* REAL S( * ), RWORK( * ) +* INTEGER IWORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGESVDQ computes the singular value decomposition (SVD) of a complex +*> M-by-N matrix A, where M >= N. The SVD of A is written as +*> [++] [xx] [x0] [xx] +*> A = U * SIGMA * V^*, [++] = [xx] * [ox] * [xx] +*> [++] [xx] +*> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal +*> matrix, and V is an N-by-N unitary matrix. The diagonal elements +*> of SIGMA are the singular values of A. The columns of U and V are the +*> left and the right singular vectors of A, respectively. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy in the computed SVD +*> = 'A' The requested accuracy corresponds to having the backward +*> error bounded by || delta A ||_F <= f(m,n) * EPS * || A ||_F, +*> where EPS = SLAMCH('Epsilon'). This authorises CGESVDQ to +*> truncate the computed triangular factor in a rank revealing +*> QR factorization whenever the truncated part is below the +*> threshold of the order of EPS * ||A||_F. This is aggressive +*> truncation level. +*> = 'M' Similarly as with 'A', but the truncation is more gentle: it +*> is allowed only when there is a drop on the diagonal of the +*> triangular factor in the QR factorization. This is medium +*> truncation level. +*> = 'H' High accuracy requested. No numerical rank determination based +*> on the rank revealing QR factorization is attempted. +*> = 'E' Same as 'H', and in addition the condition number of column +*> scaled A is estimated and returned in RWORK(1). +*> N^(-1/4)*RWORK(1) <= ||pinv(A_scaled)||_2 <= N^(1/4)*RWORK(1) +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> = 'P' The rows of A are ordered in decreasing order with respect to +*> ||A(i,:)||_\infty. This enhances numerical accuracy at the cost +*> of extra data movement. Recommended for numerical robustness. +*> = 'N' No row pivoting. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> = 'T' After the initial pivoted QR factorization, CGESVD is applied to +*> the adjoint R**H of the computed triangular factor R. This involves +*> some extra data movement (matrix transpositions). Useful for +*> experiments, research and development. +*> = 'N' The triangular factor R is given as input to CGESVD. This may be +*> preferred as it involves less data movement. 
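The JOBA = 'A' level documented above licenses truncating the triangular factor of the rank-revealing QR factorization at an absolute threshold of the order of EPS * ||A||_F. The criterion the routine applies further down compares each diagonal entry of R against SQRT(N)*EPS times |R(1,1)|; a C rendering of that loop, as a hypothetical helper over the magnitudes of the diagonal:

    #include <math.h>

    /* rmag[p] = |R(p+1,p+1)|, p = 0..n-1; eps = machine epsilon.
     * Mirrors the JOBA='A' truncation loop: count diagonal entries
     * until one drops below sqrt(n)*eps*|R(1,1)|. */
    int truncated_rank(const float *rmag, int n, float eps)
    {
        int nr = 1;
        float thresh = sqrtf((float)n) * eps * rmag[0];
        for (int p = 1; p < n; p++) {
            if (rmag[p] < thresh) break;
            nr++;
        }
        return nr;
    }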
+*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> = 'A' All M left singular vectors are computed and returned in the +*> matrix U. See the description of U. +*> = 'S' or 'U' N = min(M,N) left singular vectors are computed and returned +*> in the matrix U. See the description of U. +*> = 'R' Numerical rank NUMRANK is determined and only NUMRANK left singular +*> vectors are computed and returned in the matrix U. +*> = 'F' The N left singular vectors are returned in factored form as the +*> product of the Q factor from the initial QR factorization and the +*> N left singular vectors of (R**H , 0)**H. If row pivoting is used, +*> then the necessary information on the row pivoting is stored in +*> IWORK(N+1:N+M-1). +*> = 'N' The left singular vectors are not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> = 'A', 'V' All N right singular vectors are computed and returned in +*> the matrix V. +*> = 'R' Numerical rank NUMRANK is determined and only NUMRANK right singular +*> vectors are computed and returned in the matrix V. This option is +*> allowed only if JOBU = 'R' or JOBU = 'N'; otherwise it is illegal. +*> = 'N' The right singular vectors are not computed. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array of dimensions LDA x N +*> On entry, the input matrix A. +*> On exit, if JOBU .NE. 'N' or JOBV .NE. 'N', the lower triangle of A contains +*> the Householder vectors as stored by CGEQP3. If JOBU = 'F', these Householder +*> vectors together with CWORK(1:N) can be used to restore the Q factors from +*> the initial pivoted QR factorization of A. See the description of U. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER. +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array of dimension N. +*> The singular values of A, ordered so that S(i) >= S(i+1). +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is COMPLEX array, dimension +*> LDU x M if JOBU = 'A'; see the description of LDU. In this case, +*> on exit, U contains the M left singular vectors. +*> LDU x N if JOBU = 'S', 'U', 'R' ; see the description of LDU. In this +*> case, U contains the leading N or the leading NUMRANK left singular vectors. +*> LDU x N if JOBU = 'F' ; see the description of LDU. In this case U +*> contains N x N unitary matrix that can be used to form the left +*> singular vectors. +*> If JOBU = 'N', U is not referenced. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER. +*> The leading dimension of the array U. +*> If JOBU = 'A', 'S', 'U', 'R', LDU >= max(1,M). +*> If JOBU = 'F', LDU >= max(1,N). +*> Otherwise, LDU >= 1. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is COMPLEX array, dimension +*> LDV x N if JOBV = 'A', 'V', 'R' or if JOBA = 'E' . +*> If JOBV = 'A', or 'V', V contains the N-by-N unitary matrix V**H; +*> If JOBV = 'R', V contains the first NUMRANK rows of V**H (the right +*> singular vectors, stored rowwise, of the NUMRANK largest singular values). +*> If JOBV = 'N' and JOBA = 'E', V is used as a workspace. +*> If JOBV = 'N', and JOBA.NE.'E', V is not referenced. 
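Pulling the JOBU/JOBV options together, a usage sketch for the new driver. It goes through LAPACKE; the wrapper name LAPACKE_cgesvdq, its argument order, and the internally managed workspaces are assumptions about the C interface generated for this routine, so treat the call as a sketch rather than a fixed API:

    #include <stdio.h>
    #include <stdlib.h>
    #include <lapacke.h>

    /* Full SVD of a column-major m-by-n matrix (m >= n) with row
     * pivoting ('P') and the high-accuracy level 'H'; also reports
     * the numerical rank determined by the rank-revealing QR step. */
    int svd_with_rank(lapack_complex_float *a, lapack_int m, lapack_int n)
    {
        float *s = malloc((size_t)n * sizeof *s);
        lapack_complex_float *u = malloc((size_t)m * m * sizeof *u);
        lapack_complex_float *v = malloc((size_t)n * n * sizeof *v);
        lapack_int numrank, info;

        info = LAPACKE_cgesvdq(LAPACK_COL_MAJOR, 'H', 'P', 'N', 'A', 'A',
                               m, n, a, m, s, u, m, v, n, &numrank);
        if (info == 0)
            printf("numerical rank: %d of %d\n", (int)numrank, (int)n);
        free(v); free(u); free(s);
        return (int)info;
    }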
+*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If JOBV = 'A', 'V', 'R', or JOBA = 'E', LDV >= max(1,N). +*> Otherwise, LDV >= 1. +*> \endverbatim +*> +*> \param[out] NUMRANK +*> \verbatim +*> NUMRANK is INTEGER +*> NUMRANK is the numerical rank first determined after the rank +*> revealing QR factorization, following the strategy specified by the +*> value of JOBA. If JOBV = 'R' and JOBU = 'R', only NUMRANK +*> leading singular values and vectors are then requested in the call +*> of CGESVD. The final value of NUMRANK might be further reduced if +*> some singular values are computed as zeros. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (max(1, LIWORK)). +*> On exit, IWORK(1:N) contains column pivoting permutation of the +*> rank revealing QR factorization. +*> If JOBP = 'P', IWORK(N+1:N+M-1) contains the indices of the sequence +*> of row swaps used in row pivoting. These can be used to restore the +*> left singular vectors in the case JOBU = 'F'. +*> +*> If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> LIWORK(1) returns the minimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> LIWORK >= N + M - 1, if JOBP = 'P'; +*> LIWORK >= N if JOBP = 'N'. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the CWORK, IWORK, and RWORK arrays, and no error +*> message related to LCWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] CWORK +*> \verbatim +*> CWORK is COMPLEX array, dimension (max(2, LCWORK)), used as a workspace. +*> On exit, if, on entry, LCWORK.NE.-1, CWORK(1:N) contains parameters +*> needed to recover the Q factor from the QR factorization computed by +*> CGEQP3. +*> +*> If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> CWORK(1) returns the optimal LCWORK, and +*> CWORK(2) returns the minimal LCWORK. +*> \endverbatim +*> +*> \param[in,out] LCWORK +*> \verbatim +*> LCWORK is INTEGER +*> The dimension of the array CWORK. 
It is determined as follows: +*> Let LWQP3 = N+1, LWCON = 2*N, and let +*> LWUNQ = { MAX( N, 1 ), if JOBU = 'R', 'S', or 'U' +*> { MAX( M, 1 ), if JOBU = 'A' +*> LWSVD = MAX( 3*N, 1 ) +*> LWLQF = MAX( N/2, 1 ), LWSVD2 = MAX( 3*(N/2), 1 ), LWUNLQ = MAX( N, 1 ), +*> LWQRF = MAX( N/2, 1 ), LWUNQ2 = MAX( N, 1 ) +*> Then the minimal value of LCWORK is: +*> = MAX( N + LWQP3, LWSVD ) if only the singular values are needed; +*> = MAX( N + LWQP3, LWCON, LWSVD ) if only the singular values are needed, +*> and a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD, LWUNQ ) if the singular values and the left +*> singular vectors are requested; +*> = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) if the singular values and the left +*> singular vectors are requested, and also +*> a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD ) if the singular values and the right +*> singular vectors are requested; +*> = N + MAX( LWQP3, LWCON, LWSVD ) if the singular values and the right +*> singular vectors are requested, and also +*> a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD, LWUNQ ) if the full SVD is requested with JOBV = 'R'; +*> independent of JOBR; +*> = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) if the full SVD is requested, +*> JOBV = 'R' and, also a scaled condition +*> estimate requested; independent of JOBR; +*> = MAX( N + MAX( LWQP3, LWSVD, LWUNQ ), +*> N + MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, N/2+LWUNLQ, LWUNQ) ) if the +*> full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='N' +*> = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ), +*> N + MAX( LWQP3, LWCON, N/2+LWLQF, N/2+LWSVD2, N/2+LWUNLQ, LWUNQ ) ) +*> if the full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='N', and also a scaled condition number estimate +*> requested. +*> = MAX( N + MAX( LWQP3, LWSVD, LWUNQ ), +*> N + MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, N/2+LWUNQ2, LWUNQ ) ) if the +*> full SVD is requested with JOBV = 'A', 'V', and JOBR ='T' +*> = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ), +*> N + MAX( LWQP3, LWCON, N/2+LWQRF, N/2+LWSVD2, N/2+LWUNQ2, LWUNQ ) ) +*> if the full SVD is requested with JOBV = 'A', 'V' and +*> JOBR ='T', and also a scaled condition number estimate +*> requested. +*> Finally, LCWORK must be at least two: LCWORK = MAX( 2, LCWORK ). +*> +*> If LCWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the CWORK, IWORK, and RWORK arrays, and no error +*> message related to LCWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (max(1, LRWORK)). +*> On exit, +*> 1. If JOBA = 'E', RWORK(1) contains an estimate of the condition +*> number of column scaled A. If A = C * D where D is diagonal and C +*> has unit columns in the Euclidean norm, then, assuming full column rank, +*> N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1). +*> Otherwise, RWORK(1) = -1. +*> 2. RWORK(2) contains the number of singular values computed as +*> exact zeros in CGESVD applied to the upper triangular or trapezoidal +*> R (from the initial QR factorization). In case of early exit (no call to +*> CGESVD, such as in the case of zero matrix) RWORK(2) = -1. +*> +*> If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> RWORK(1) returns the minimal LRWORK. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER. +*> The dimension of the array RWORK.
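The LCWORK = -1 convention above is the standard LAPACK workspace query, and the same two-call pattern applies to LRWORK and LIWORK. A sketch of the pattern against a raw Fortran symbol, shown with CGEQRF to keep the argument list short; the trailing-underscore name and 32-bit INTEGER width are assumptions about the build (OpenBLAS without INTERFACE64):

    #include <stdlib.h>

    typedef struct { float re, im; } cfloat;  /* layout of Fortran COMPLEX */
    extern void cgeqrf_(const int *m, const int *n, cfloat *a, const int *lda,
                        cfloat *tau, cfloat *work, const int *lwork, int *info);

    void qr_with_query(cfloat *a, int m, int n, cfloat *tau)
    {
        int lda = m, info, lwork = -1;
        cfloat wkopt;
        /* First call: lwork = -1 asks only for the optimal size,
         * which comes back in WORK(1). */
        cgeqrf_(&m, &n, a, &lda, tau, &wkopt, &lwork, &info);
        lwork = (int)wkopt.re;
        cfloat *work = malloc((size_t)lwork * sizeof *work);
        /* Second call does the factorization with a big-enough buffer. */
        cgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);
        free(work);
    }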
+*> If JOBP ='P', then LRWORK >= MAX(2, M, 5*N); +*> Otherwise, LRWORK >= MAX(2, 5*N). +*> +*> If LRWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the CWORK, IWORK, and RWORK arrays, and no error +*> message related to LCWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> > 0: if CBDSQR did not converge, INFO specifies how many superdiagonals +*> of an intermediate bidiagonal form B (computed in CGESVD) did not +*> converge to zero. +*> \endverbatim +* +*> \par Further Details: +* ======================== +*> +*> \verbatim +*> +*> 1. The data movement (matrix transpose) is coded using simple nested +*> DO-loops because BLAS and LAPACK do not provide corresponding subroutines. +*> Those DO-loops are easily identified in this source code - by the CONTINUE +*> statements labeled with 11**. In an optimized version of this code, the +*> nested DO loops should be replaced with calls to an optimized subroutine. +*> 2. This code scales A by 1/SQRT(M) if the largest ABS(A(i,j)) could cause +*> column norm overflow. This is the minimal precaution and it is left to the +*> SVD routine (CGESVD) to do its own preemptive scaling if potential over- +*> or underflows are detected. To avoid repeated scanning of the array A, +*> an optimal implementation would do all necessary scaling before calling +*> CGESVD and the scaling in CGESVD can be switched off. +*> 3. Other comments related to code optimization are given in comments in the +*> code, enclosed in [[double brackets]]. +*> \endverbatim +* +*> \par Bugs, examples and comments +* =========================== +* +*> \verbatim +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> \endverbatim +* +*> \par References +* =============== +* +*> \verbatim +*> [1] Zlatko Drmac, Algorithm 977: A QR-Preconditioned QR SVD Method for +*> Computing the SVD with High Accuracy. ACM Trans. Math. Softw. +*> 44(1): 11:1-11:30 (2017) +*> +*> SIGMA library, xGESVDQ section updated February 2016. +*> Developed and coded by Zlatko Drmac, Department of Mathematics +*> University of Zagreb, Croatia, drmac@math.hr +*> \endverbatim +* +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> Developed and coded by Zlatko Drmac, Department of Mathematics +*> University of Zagreb, Croatia, drmac@math.hr +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2018 +* +*> \ingroup complexGEsing +* +* ===================================================================== + SUBROUTINE CGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA, + $ S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK, + $ CWORK, LCWORK, RWORK, LRWORK, INFO ) +* .. Scalar Arguments .. + IMPLICIT NONE + CHARACTER JOBA, JOBP, JOBR, JOBU, JOBV + INTEGER M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LCWORK, LRWORK, + $ INFO +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( * ) + REAL S( * ), RWORK( * ) + INTEGER IWORK( * ) +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), CONE = ( 1.0E0, 0.0E0 ) ) +* ..
+* .. Local Scalars .. + INTEGER IERR, NR, N1, OPTRATIO, p, q + INTEGER LWCON, LWQP3, LWRK_CGELQF, LWRK_CGESVD, LWRK_CGESVD2, + $ LWRK_CGEQP3, LWRK_CGEQRF, LWRK_CUNMLQ, LWRK_CUNMQR, + $ LWRK_CUNMQR2, LWLQF, LWQRF, LWSVD, LWSVD2, LWUNQ, + $ LWUNQ2, LWUNLQ, MINWRK, MINWRK2, OPTWRK, OPTWRK2, + $ IMINWRK, RMINWRK + LOGICAL ACCLA, ACCLM, ACCLH, ASCALED, CONDA, DNTWU, DNTWV, + $ LQUERY, LSVC0, LSVEC, ROWPRM, RSVEC, RTRANS, WNTUA, + $ WNTUF, WNTUR, WNTUS, WNTVA, WNTVR + REAL BIG, EPSLN, RTMP, SCONDA, SFMIN + COMPLEX CTMP +* .. +* .. Local Arrays + COMPLEX CDUMMY(1) + REAL RDUMMY(1) +* .. +* .. External Subroutines (BLAS, LAPACK) + EXTERNAL CGELQF, CGEQP3, CGEQRF, CGESVD, CLACPY, CLAPMT, + $ CLASCL, CLASET, CLASWP, CSSCAL, SLASET, SLASCL, + $ CPOCON, CUNMLQ, CUNMQR, XERBLA +* .. +* .. External Functions (BLAS, LAPACK) + LOGICAL LSAME + INTEGER ISAMAX + REAL CLANGE, SCNRM2, SLAMCH + EXTERNAL CLANGE, LSAME, ISAMAX, SCNRM2, SLAMCH +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX, MIN, REAL, SQRT +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + WNTUS = LSAME( JOBU, 'S' ) .OR. LSAME( JOBU, 'U' ) + WNTUR = LSAME( JOBU, 'R' ) + WNTUA = LSAME( JOBU, 'A' ) + WNTUF = LSAME( JOBU, 'F' ) + LSVC0 = WNTUS .OR. WNTUR .OR. WNTUA + LSVEC = LSVC0 .OR. WNTUF + DNTWU = LSAME( JOBU, 'N' ) +* + WNTVR = LSAME( JOBV, 'R' ) + WNTVA = LSAME( JOBV, 'A' ) .OR. LSAME( JOBV, 'V' ) + RSVEC = WNTVR .OR. WNTVA + DNTWV = LSAME( JOBV, 'N' ) +* + ACCLA = LSAME( JOBA, 'A' ) + ACCLM = LSAME( JOBA, 'M' ) + CONDA = LSAME( JOBA, 'E' ) + ACCLH = LSAME( JOBA, 'H' ) .OR. CONDA +* + ROWPRM = LSAME( JOBP, 'P' ) + RTRANS = LSAME( JOBR, 'T' ) +* + IF ( ROWPRM ) THEN + IMINWRK = MAX( 1, N + M - 1 ) + RMINWRK = MAX( 2, M, 5*N ) + ELSE + IMINWRK = MAX( 1, N ) + RMINWRK = MAX( 2, 5*N ) + END IF + LQUERY = (LIWORK .EQ. -1 .OR. LCWORK .EQ. -1 .OR. LRWORK .EQ. -1) + INFO = 0 + IF ( .NOT. ( ACCLA .OR. ACCLM .OR. ACCLH ) ) THEN + INFO = -1 + ELSE IF ( .NOT.( ROWPRM .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = -2 + ELSE IF ( .NOT.( RTRANS .OR. LSAME( JOBR, 'N' ) ) ) THEN + INFO = -3 + ELSE IF ( .NOT.( LSVEC .OR. DNTWU ) ) THEN + INFO = -4 + ELSE IF ( WNTUR .AND. WNTVA ) THEN + INFO = -5 + ELSE IF ( .NOT.( RSVEC .OR. DNTWV )) THEN + INFO = -5 + ELSE IF ( M.LT.0 ) THEN + INFO = -6 + ELSE IF ( ( N.LT.0 ) .OR. ( N.GT.M ) ) THEN + INFO = -7 + ELSE IF ( LDA.LT.MAX( 1, M ) ) THEN + INFO = -9 + ELSE IF ( LDU.LT.1 .OR. ( LSVC0 .AND. LDU.LT.M ) .OR. + $ ( WNTUF .AND. LDU.LT.N ) ) THEN + INFO = -12 + ELSE IF ( LDV.LT.1 .OR. ( RSVEC .AND. LDV.LT.N ) .OR. + $ ( CONDA .AND. LDV.LT.N ) ) THEN + INFO = -14 + ELSE IF ( LIWORK .LT. IMINWRK .AND. .NOT. LQUERY ) THEN + INFO = -17 + END IF +* +* + IF ( INFO .EQ. 0 ) THEN +* +* Compute workspace +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LCWORK are written with a lot of redundancy and +* can be simplified. However, this detailed form is easier for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for CGEQP3 of an M x N matrix + LWQP3 = N+1 +* .. minimal workspace length for CUNMQR to build left singular vectors + IF ( WNTUS .OR. WNTUR ) THEN + LWUNQ = MAX( N , 1 ) + ELSE IF ( WNTUA ) THEN + LWUNQ = MAX( M , 1 ) + END IF +* .. minimal workspace length for CPOCON of an N x N matrix + LWCON = 2 * N +* .. 
CGESVD of an N x N matrix + LWSVD = MAX( 3 * N, 1 ) + IF ( LQUERY ) THEN + CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_CGEQP3 = INT( CDUMMY(1) ) + IF ( WNTUS .OR. WNTUR ) THEN + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQR = INT( CDUMMY(1) ) + ELSE IF ( WNTUA ) THEN + CALL CUNMQR( 'L', 'N', M, M, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQR = INT( CDUMMY(1) ) + ELSE + LWRK_CUNMQR = 0 + END IF + END IF + MINWRK = 2 + OPTWRK = 2 + IF ( .NOT. (LSVEC .OR. RSVEC )) THEN +* .. minimal and optimal sizes of the complex workspace if +* only the singular values are requested + IF ( CONDA ) THEN + MINWRK = MAX( N+LWQP3, LWCON, LWSVD ) + ELSE + MINWRK = MAX( N+LWQP3, LWSVD ) + END IF + IF ( LQUERY ) THEN + CALL CGESVD( 'N', 'N', N, N, A, LDA, S, U, LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + LWRK_CGESVD = INT( CDUMMY(1) ) + IF ( CONDA ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, LWRK_CGESVD ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, LWRK_CGESVD ) + END IF + END IF + ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the left singular vectors are requested + IF ( CONDA ) THEN + MINWRK = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) + ELSE + MINWRK = N + MAX( LWQP3, LWSVD, LWUNQ ) + END IF + IF ( LQUERY ) THEN + IF ( RTRANS ) THEN + CALL CGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + ELSE + CALL CGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + END IF + LWRK_CGESVD = INT( CDUMMY(1) ) + IF ( CONDA ) THEN + OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, LWRK_CGESVD, + $ LWRK_CUNMQR ) + ELSE + OPTWRK = N + MAX( LWRK_CGEQP3, LWRK_CGESVD, + $ LWRK_CUNMQR ) + END IF + END IF + ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the right singular vectors are requested + IF ( CONDA ) THEN + MINWRK = N + MAX( LWQP3, LWCON, LWSVD ) + ELSE + MINWRK = N + MAX( LWQP3, LWSVD ) + END IF + IF ( LQUERY ) THEN + IF ( RTRANS ) THEN + CALL CGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + ELSE + CALL CGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + END IF + LWRK_CGESVD = INT( CDUMMY(1) ) + IF ( CONDA ) THEN + OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, LWRK_CGESVD ) + ELSE + OPTWRK = N + MAX( LWRK_CGEQP3, LWRK_CGESVD ) + END IF + END IF + ELSE +* .. minimal and optimal sizes of the complex workspace if the +* full SVD is requested + IF ( RTRANS ) THEN + MINWRK = MAX( LWQP3, LWSVD, LWUNQ ) + IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON ) + MINWRK = MINWRK + N + IF ( WNTVA ) THEN +* .. minimal workspace length for N x N/2 CGEQRF + LWQRF = MAX( N/2, 1 ) +* .. minimal workspace length for N/2 x N/2 CGESVD + LWSVD2 = MAX( 3 * (N/2), 1 ) + LWUNQ2 = MAX( N, 1 ) + MINWRK2 = MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, + $ N/2+LWUNQ2, LWUNQ ) + IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON ) + MINWRK2 = N + MINWRK2 + MINWRK = MAX( MINWRK, MINWRK2 ) + END IF + ELSE + MINWRK = MAX( LWQP3, LWSVD, LWUNQ ) + IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON ) + MINWRK = MINWRK + N + IF ( WNTVA ) THEN +* ..
minimal workspace length for N/2 x N CGELQF + LWLQF = MAX( N/2, 1 ) + LWSVD2 = MAX( 3 * (N/2), 1 ) + LWUNLQ = MAX( N , 1 ) + MINWRK2 = MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, + $ N/2+LWUNLQ, LWUNQ ) + IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON ) + MINWRK2 = N + MINWRK2 + MINWRK = MAX( MINWRK, MINWRK2 ) + END IF + END IF + IF ( LQUERY ) THEN + IF ( RTRANS ) THEN + CALL CGESVD( 'O', 'A', N, N, A, LDA, S, U, LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + LWRK_CGESVD = INT( CDUMMY(1) ) + OPTWRK = MAX(LWRK_CGEQP3,LWRK_CGESVD,LWRK_CUNMQR) + IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON ) + OPTWRK = N + OPTWRK + IF ( WNTVA ) THEN + CALL CGEQRF(N,N/2,U,LDU,CDUMMY,CDUMMY,-1,IERR) + LWRK_CGEQRF = INT( CDUMMY(1) ) + CALL CGESVD( 'S', 'O', N/2,N/2, V,LDV, S, U,LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + LWRK_CGESVD2 = INT( CDUMMY(1) ) + CALL CUNMQR( 'R', 'C', N, N, N/2, U, LDU, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMQR2 = INT( CDUMMY(1) ) + OPTWRK2 = MAX( LWRK_CGEQP3, N/2+LWRK_CGEQRF, + $ N/2+LWRK_CGESVD2, N/2+LWRK_CUNMQR2 ) + IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON ) + OPTWRK2 = N + OPTWRK2 + OPTWRK = MAX( OPTWRK, OPTWRK2 ) + END IF + ELSE + CALL CGESVD( 'S', 'O', N, N, A, LDA, S, U, LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + LWRK_CGESVD = INT( CDUMMY(1) ) + OPTWRK = MAX(LWRK_CGEQP3,LWRK_CGESVD,LWRK_CUNMQR) + IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON ) + OPTWRK = N + OPTWRK + IF ( WNTVA ) THEN + CALL CGELQF(N/2,N,U,LDU,CDUMMY,CDUMMY,-1,IERR) + LWRK_CGELQF = INT( CDUMMY(1) ) + CALL CGESVD( 'S','O', N/2,N/2, V, LDV, S, U, LDU, + $ V, LDV, CDUMMY, -1, RDUMMY, IERR ) + LWRK_CGESVD2 = INT( CDUMMY(1) ) + CALL CUNMLQ( 'R', 'N', N, N, N/2, U, LDU, CDUMMY, + $ V, LDV, CDUMMY,-1,IERR ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) + OPTWRK2 = MAX( LWRK_CGEQP3, N/2+LWRK_CGELQF, + $ N/2+LWRK_CGESVD2, N/2+LWRK_CUNMLQ ) + IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON ) + OPTWRK2 = N + OPTWRK2 + OPTWRK = MAX( OPTWRK, OPTWRK2 ) + END IF + END IF + END IF + END IF +* + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( 2, OPTWRK ) + IF ( LCWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = -19 +* + END IF +* + IF (INFO .EQ. 0 .AND. LRWORK .LT. RMINWRK .AND. .NOT. LQUERY) THEN + INFO = -21 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGESVDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +* +* Return optimal workspace +* + IWORK(1) = IMINWRK + CWORK(1) = OPTWRK + CWORK(2) = MINWRK + RWORK(1) = RMINWRK + RETURN + END IF +* +* Quick return if the matrix is void. +* + IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) ) THEN +* .. all output is void. + RETURN + END IF +* + BIG = SLAMCH('O') + ASCALED = .FALSE. + IF ( ROWPRM ) THEN +* .. reordering the rows in decreasing sequence in the +* ell-infinity norm - this enhances numerical robustness in +* the case of differently scaled rows. + DO 1904 p = 1, M +* RWORK(p) = ABS( A(p,ICAMAX(N,A(p,1),LDA)) ) +* [[CLANGE will return NaN if an entry of the p-th row is Nan]] + RWORK(p) = CLANGE( 'M', 1, N, A(p,1), LDA, RDUMMY ) +* .. check for NaN's and Inf's + IF ( ( RWORK(p) .NE. RWORK(p) ) .OR. + $ ( (RWORK(p)*ZERO) .NE. ZERO ) ) THEN + INFO = - 8 + CALL XERBLA( 'CGESVDQ', -INFO ) + RETURN + END IF + 1904 CONTINUE + DO 1952 p = 1, M - 1 + q = ISAMAX( M-p+1, RWORK(p), 1 ) + p - 1 + IWORK(N+p) = q + IF ( p .NE. q ) THEN + RTMP = RWORK(p) + RWORK(p) = RWORK(q) + RWORK(q) = RTMP + END IF + 1952 CONTINUE +* + IF ( RWORK(1) .EQ. ZERO ) THEN +* Quick return: A is the M x N zero matrix. 
+ NUMRANK = 0 + CALL SLASET( 'G', N, 1, ZERO, ZERO, S, N ) + IF ( WNTUS ) CALL CLASET('G', M, N, CZERO, CONE, U, LDU) + IF ( WNTUA ) CALL CLASET('G', M, M, CZERO, CONE, U, LDU) + IF ( WNTVA ) CALL CLASET('G', N, N, CZERO, CONE, V, LDV) + IF ( WNTUF ) THEN + CALL CLASET( 'G', N, 1, CZERO, CZERO, CWORK, N ) + CALL CLASET( 'G', M, N, CZERO, CONE, U, LDU ) + END IF + DO 5001 p = 1, N + IWORK(p) = p + 5001 CONTINUE + IF ( ROWPRM ) THEN + DO 5002 p = N + 1, N + M - 1 + IWORK(p) = p - N + 5002 CONTINUE + END IF + IF ( CONDA ) RWORK(1) = -1 + RWORK(2) = -1 + RETURN + END IF +* + IF ( RWORK(1) .GT. BIG / SQRT(REAL(M)) ) THEN +* .. to prevent overflow in the QR factorization, scale the +* matrix by 1/sqrt(M) if too large entry detected + CALL CLASCL('G',0,0,SQRT(REAL(M)),ONE, M,N, A,LDA, IERR) + ASCALED = .TRUE. + END IF + CALL CLASWP( N, A, LDA, 1, M-1, IWORK(N+1), 1 ) + END IF +* +* .. At this stage, preemptive scaling is done only to avoid column +* norms overflows during the QR factorization. The SVD procedure should +* have its own scaling to save the singular values from overflows and +* underflows. That depends on the SVD procedure. +* + IF ( .NOT.ROWPRM ) THEN + RTMP = CLANGE( 'M', M, N, A, LDA, RWORK ) + IF ( ( RTMP .NE. RTMP ) .OR. + $ ( (RTMP*ZERO) .NE. ZERO ) ) THEN + INFO = - 8 + CALL XERBLA( 'CGESVDQ', -INFO ) + RETURN + END IF + IF ( RTMP .GT. BIG / SQRT(REAL(M)) ) THEN +* .. to prevent overflow in the QR factorization, scale the +* matrix by 1/sqrt(M) if too large entry detected + CALL CLASCL('G',0,0, SQRT(REAL(M)),ONE, M,N, A,LDA, IERR) + ASCALED = .TRUE. + END IF + END IF +* +* .. QR factorization with column pivoting +* +* A * P = Q * [ R ] +* [ 0 ] +* + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL CGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LCWORK-N, + $ RWORK, IERR ) +* +* If the user requested accuracy level allows truncation in the +* computed upper triangular factor, the matrix R is examined and, +* if possible, replaced with its leading upper trapezoidal part. +* + EPSLN = SLAMCH('E') + SFMIN = SLAMCH('S') +* SMALL = SFMIN / EPSLN + NR = N +* + IF ( ACCLA ) THEN +* +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPS*||A||_F are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPS*||A||_F. + NR = 1 + RTMP = SQRT(REAL(N))*EPSLN + DO 3001 p = 2, N + IF ( ABS(A(p,p)) .LT. (RTMP*ABS(A(1,1))) ) GO TO 3002 + NR = NR + 1 + 3001 CONTINUE + 3002 CONTINUE +* + ELSEIF ( ACCLM ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R is used as the criterion for being +* close-to-rank-deficient. The threshold is set to EPSLN=SLAMCH('E'). +* [[This can be made more flexible by replacing this hard-coded value +* with a user specified threshold.]] Also, the values that underflow +* will be truncated. + NR = 1 + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SFMIN ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* .. RRQR not authorized to determine numerical rank except in the +* obvious case of zero pivots. +* .. inspect R for exact zeros on the diagonal; +* R(i,i)=0 => R(i:N,i:N)=0. + NR = 1 + DO 3501 p = 2, N + IF ( ABS(A(p,p)) .EQ. ZERO ) GO TO 3502 + NR = NR + 1 + 3501 CONTINUE + 3502 CONTINUE +* + IF ( CONDA ) THEN +* Estimate the scaled condition number of A. 
Use the fact that it is +* the same as the scaled condition number of R. +* .. V is used as workspace + CALL CLACPY( 'U', N, N, A, LDA, V, LDV ) +* Only the leading NR x NR submatrix of the triangular factor +* is considered. Only if NR=N will this give a reliable error +* bound. However, even for NR < N, this can be used on an +* expert level and obtain useful information in the sense of +* perturbation theory. + DO 3053 p = 1, NR + RTMP = SCNRM2( p, V(1,p), 1 ) + CALL CSSCAL( p, ONE/RTMP, V(1,p), 1 ) + 3053 CONTINUE + IF ( .NOT. ( LSVEC .OR. RSVEC ) ) THEN + CALL CPOCON( 'U', NR, V, LDV, ONE, RTMP, + $ CWORK, RWORK, IERR ) + ELSE + CALL CPOCON( 'U', NR, V, LDV, ONE, RTMP, + $ CWORK(N+1), RWORK, IERR ) + END IF + SCONDA = ONE / SQRT(RTMP) +* For NR=N, SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1), +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +* See the reference [1] for more details. + END IF +* + ENDIF +* + IF ( WNTUR ) THEN + N1 = NR + ELSE IF ( WNTUS .OR. WNTUF) THEN + N1 = N + ELSE IF ( WNTUA ) THEN + N1 = M + END IF +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +*....................................................................... +* .. only the singular values are requested +*....................................................................... + IF ( RTRANS ) THEN +* +* .. compute the singular values of R**H = [A](1:NR,1:N)**H +* .. set the lower triangle of [A] to [A](1:NR,1:N)**H and +* the upper triangle of [A] to zero. + DO 1146 p = 1, MIN( N, NR ) + A(p,p) = CONJG(A(p,p)) + DO 1147 q = p + 1, N + A(q,p) = CONJG(A(p,q)) + IF ( q .LE. NR ) A(p,q) = CZERO + 1147 CONTINUE + 1146 CONTINUE +* + CALL CGESVD( 'N', 'N', N, NR, A, LDA, S, U, LDU, + $ V, LDV, CWORK, LCWORK, RWORK, INFO ) +* + ELSE +* +* .. compute the singular values of R = [A](1:NR,1:N) +* + IF ( NR .GT. 1 ) + $ CALL CLASET( 'L', NR-1,NR-1, CZERO,CZERO, A(2,1), LDA ) + CALL CGESVD( 'N', 'N', NR, N, A, LDA, S, U, LDU, + $ V, LDV, CWORK, LCWORK, RWORK, INFO ) +* + END IF +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC) ) THEN +*....................................................................... +* .. the singular values and the left singular vectors requested +*....................................................................... + IF ( RTRANS ) THEN +* .. apply CGESVD to R**H +* .. copy R**H into [U] and overwrite [U] with the right singular +* vectors of R + DO 1192 p = 1, NR + DO 1193 q = p, N + U(q,p) = CONJG(A(p,q)) + 1193 CONTINUE + 1192 CONTINUE + IF ( NR .GT. 1 ) + $ CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, U(1,2), LDU ) +* .. the left singular vectors not computed, the NR right singular +* vectors overwrite [U](1:NR,1:NR) as conjugate transposed. These +* will be pre-multiplied by Q to build the left singular vectors of A. + CALL CGESVD( 'N', 'O', N, NR, U, LDU, S, U, LDU, + $ U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO ) +* + DO 1119 p = 1, NR + U(p,p) = CONJG(U(p,p)) + DO 1120 q = p + 1, NR + CTMP = CONJG(U(q,p)) + U(q,p) = CONJG(U(p,q)) + U(p,q) = CTMP + 1120 CONTINUE + 1119 CONTINUE +* + ELSE +* .. apply CGESVD to R +* .. copy R into [U] and overwrite [U] with the left singular vectors + CALL CLACPY( 'U', NR, N, A, LDA, U, LDU ) + IF ( NR .GT. 1 ) + $ CALL CLASET( 'L', NR-1, NR-1, CZERO, CZERO, U(2,1), LDU ) +* .. the right singular vectors not computed, the NR left singular +* vectors overwrite [U](1:NR,1:NR) + CALL CGESVD( 'O', 'N', NR, N, U, LDU, S, U, LDU, + $ V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO ) +* .. now [U](1:NR,1:NR) contains the NR left singular vectors of +* R.
These will be pre-multiplied by Q to build the left singular +* vectors of A. + END IF +* +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. ( .NOT.WNTUF ) ) THEN + CALL CLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL CLASET( 'A',NR,N1-NR,CZERO,CZERO,U(1,NR+1), LDU ) + CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* vectors matrix U. +* + IF ( .NOT.WNTUF ) + $ CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LCWORK-N, IERR ) + IF ( ROWPRM .AND. .NOT.WNTUF ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 ) +* + ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN +*....................................................................... +* .. the singular values and the right singular vectors requested +*....................................................................... + IF ( RTRANS ) THEN +* .. apply CGESVD to R**H +* .. copy R**H into V and overwrite V with the left singular vectors + DO 1165 p = 1, NR + DO 1166 q = p, N + V(q,p) = CONJG(A(p,q)) + 1166 CONTINUE + 1165 CONTINUE + IF ( NR .GT. 1 ) + $ CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV ) +* .. the left singular vectors of R**H overwrite V, the right singular +* vectors not computed + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN + CALL CGESVD( 'O', 'N', N, NR, V, LDV, S, U, LDU, + $ U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO ) +* + DO 1121 p = 1, NR + V(p,p) = CONJG(V(p,p)) + DO 1122 q = p + 1, NR + CTMP = CONJG(V(q,p)) + V(q,p) = CONJG(V(p,q)) + V(p,q) = CTMP + 1122 CONTINUE + 1121 CONTINUE +* + IF ( NR .LT. N ) THEN + DO 1103 p = 1, NR + DO 1104 q = NR + 1, N + V(p,q) = CONJG(V(q,p)) + 1104 CONTINUE + 1103 CONTINUE + END IF + CALL CLAPMT( .FALSE., NR, N, V, LDV, IWORK ) + ELSE +* .. need all N right singular vectors and NR < N +* [!] This is a simple implementation that augments [V](1:N,1:NR) +* by padding a zero block. In the case NR << N, a more efficient +* way is to first use the QR factorization. For more details +* how to implement this, see the " FULL SVD " branch. + CALL CLASET('G', N, N-NR, CZERO, CZERO, V(1,NR+1), LDV) + CALL CGESVD( 'O', 'N', N, N, V, LDV, S, U, LDU, + $ U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO ) +* + DO 1123 p = 1, N + V(p,p) = CONJG(V(p,p)) + DO 1124 q = p + 1, N + CTMP = CONJG(V(q,p)) + V(q,p) = CONJG(V(p,q)) + V(p,q) = CTMP + 1124 CONTINUE + 1123 CONTINUE + CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK ) + END IF +* + ELSE +* .. apply CGESVD to R +* .. copy R into V and overwrite V with the right singular vectors + CALL CLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL CLASET( 'L', NR-1, NR-1, CZERO, CZERO, V(2,1), LDV ) +* .. the right singular vectors overwrite V, the NR left singular +* vectors stored in U(1:NR,1:NR) + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN + CALL CGESVD( 'N', 'O', NR, N, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO ) + CALL CLAPMT( .FALSE., NR, N, V, LDV, IWORK ) +* .. now [V](1:NR,1:N) contains V(1:N,1:NR)**H + ELSE +* .. need all N right singular vectors and NR < N +* [!] This is a simple implementation that augments [V](1:NR,1:N) +* by padding a zero block. In the case NR << N, a more efficient +* way is to first use the LQ factorization. For more details +* how to implement this, see the " FULL SVD " branch.
+ CALL CLASET('G', N-NR, N, CZERO,CZERO, V(NR+1,1), LDV) + CALL CGESVD( 'N', 'O', N, N, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO ) + CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK ) + END IF +* .. now [V] contains the adjoint of the matrix of the right singular +* vectors of A. + END IF +* + ELSE +*....................................................................... +* .. FULL SVD requested +*....................................................................... + IF ( RTRANS ) THEN +* +* .. apply CGESVD to R**H [[this option is left for R&D&T]] +* + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN +* .. copy R**H into [V] and overwrite [V] with the left singular +* vectors of R**H + DO 1168 p = 1, NR + DO 1169 q = p, N + V(q,p) = CONJG(A(p,q)) + 1169 CONTINUE + 1168 CONTINUE + IF ( NR .GT. 1 ) + $ CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV ) +* +* .. the left singular vectors of R**H overwrite [V], the NR right +* singular vectors of R**H stored in [U](1:NR,1:NR) as conjugate +* transposed + CALL CGESVD( 'O', 'A', N, NR, V, LDV, S, V, LDV, + $ U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO ) +* .. assemble V + DO 1115 p = 1, NR + V(p,p) = CONJG(V(p,p)) + DO 1116 q = p + 1, NR + CTMP = CONJG(V(q,p)) + V(q,p) = CONJG(V(p,q)) + V(p,q) = CTMP + 1116 CONTINUE + 1115 CONTINUE + IF ( NR .LT. N ) THEN + DO 1101 p = 1, NR + DO 1102 q = NR+1, N + V(p,q) = CONJG(V(q,p)) + 1102 CONTINUE + 1101 CONTINUE + END IF + CALL CLAPMT( .FALSE., NR, N, V, LDV, IWORK ) +* + DO 1117 p = 1, NR + U(p,p) = CONJG(U(p,p)) + DO 1118 q = p + 1, NR + CTMP = CONJG(U(q,p)) + U(q,p) = CONJG(U(p,q)) + U(p,q) = CTMP + 1118 CONTINUE + 1117 CONTINUE +* + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL CLASET('A', M-NR,NR, CZERO,CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF +* + ELSE +* .. need all N right singular vectors and NR < N +* .. copy R**H into [V] and overwrite [V] with the left singular +* vectors of R**H +* [[The optimal ratio N/NR for using QRF instead of padding +* with zeros. Here hard coded to 2; it must be at least +* two due to work space constraints.]] +* OPTRATIO = ILAENV(6, 'CGESVD', 'S' // 'O', NR,N,0,0) +* OPTRATIO = MAX( OPTRATIO, 2 ) + OPTRATIO = 2 + IF ( OPTRATIO*NR .GT. N ) THEN + DO 1198 p = 1, NR + DO 1199 q = p, N + V(q,p) = CONJG(A(p,q)) + 1199 CONTINUE + 1198 CONTINUE + IF ( NR .GT. 1 ) + $ CALL CLASET('U',NR-1,NR-1, CZERO,CZERO, V(1,2),LDV) +* + CALL CLASET('A',N,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL CGESVD( 'O', 'A', N, N, V, LDV, S, V, LDV, + $ U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO ) +* + DO 1113 p = 1, N + V(p,p) = CONJG(V(p,p)) + DO 1114 q = p + 1, N + CTMP = CONJG(V(q,p)) + V(q,p) = CONJG(V(p,q)) + V(p,q) = CTMP + 1114 CONTINUE + 1113 CONTINUE + CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x N1), i.e. (M x N) or (M x M). +* + DO 1111 p = 1, N + U(p,p) = CONJG(U(p,p)) + DO 1112 q = p + 1, N + CTMP = CONJG(U(q,p)) + U(q,p) = CONJG(U(p,q)) + U(p,q) = CTMP + 1112 CONTINUE + 1111 CONTINUE +* + IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL CLASET('A',M-N,N,CZERO,CZERO,U(N+1,1),LDU) + IF ( N .LT. N1 ) THEN + CALL CLASET('A',N,N1-N,CZERO,CZERO,U(1,N+1),LDU) + CALL CLASET('A',M-N,N1-N,CZERO,CONE, + $ U(N+1,N+1), LDU ) + END IF + END IF + ELSE +* .. 
copy R**H into [U] and overwrite [U] with the right +* singular vectors of R + DO 1196 p = 1, NR + DO 1197 q = p, N + U(q,NR+p) = CONJG(A(p,q)) + 1197 CONTINUE + 1196 CONTINUE + IF ( NR .GT. 1 ) + $ CALL CLASET('U',NR-1,NR-1,CZERO,CZERO,U(1,NR+2),LDU) + CALL CGEQRF( N, NR, U(1,NR+1), LDU, CWORK(N+1), + $ CWORK(N+NR+1), LCWORK-N-NR, IERR ) + DO 1143 p = 1, NR + DO 1144 q = 1, N + V(q,p) = CONJG(U(p,NR+q)) + 1144 CONTINUE + 1143 CONTINUE + CALL CLASET('U',NR-1,NR-1,CZERO,CZERO,V(1,2),LDV) + CALL CGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU, + $ V,LDV, CWORK(N+NR+1),LCWORK-N-NR,RWORK, INFO ) + CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV) + CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + CALL CUNMQR('R','C', N, N, NR, U(1,NR+1), LDU, + $ CWORK(N+1),V,LDV,CWORK(N+NR+1),LCWORK-N-NR,IERR) + CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL CLASET('A',M-NR,NR,CZERO,CZERO,U(NR+1,1),LDU) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF + END IF + END IF +* + ELSE +* +* .. apply CGESVD to R [[this is the recommended option]] +* + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN +* .. copy R into [V] and overwrite V with the right singular vectors + CALL CLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL CLASET( 'L', NR-1,NR-1, CZERO,CZERO, V(2,1), LDV ) +* .. the right singular vectors of R overwrite [V], the NR left +* singular vectors of R stored in [U](1:NR,1:NR) + CALL CGESVD( 'S', 'O', NR, N, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO ) + CALL CLAPMT( .FALSE., NR, N, V, LDV, IWORK ) +* .. now [V](1:NR,1:N) contains V(1:N,1:NR)**H +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL CLASET('A', M-NR,NR, CZERO,CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF +* + ELSE +* .. need all N right singular vectors and NR < N +* .. the requested number of the left singular vectors +* is then N1 (N or M) +* [[The optimal ratio N/NR for using LQ instead of padding +* with zeros. Here hard coded to 2; it must be at least +* two due to work space constraints.]] +* OPTRATIO = ILAENV(6, 'CGESVD', 'S' // 'O', NR,N,0,0) +* OPTRATIO = MAX( OPTRATIO, 2 ) + OPTRATIO = 2 + IF ( OPTRATIO * NR .GT. N ) THEN + CALL CLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL CLASET('L', NR-1,NR-1, CZERO,CZERO, V(2,1),LDV) +* .. the right singular vectors of R overwrite [V], the NR left +* singular vectors of R stored in [U](1:NR,1:NR) + CALL CLASET('A', N-NR,N, CZERO,CZERO, V(NR+1,1),LDV) + CALL CGESVD( 'S', 'O', N, N, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO ) + CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. now [V] contains the adjoint of the matrix of the right +* singular vectors of A. The leading N left singular vectors +* are in [U](1:N,1:N) +* .. assemble the left singular vector matrix U of dimensions +* (M x N1), i.e. (M x N) or (M x M). + IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL CLASET('A',M-N,N,CZERO,CZERO,U(N+1,1),LDU) + IF ( N .LT. 
N1 ) THEN + CALL CLASET('A',N,N1-N,CZERO,CZERO,U(1,N+1),LDU) + CALL CLASET( 'A',M-N,N1-N,CZERO,CONE, + $ U(N+1,N+1), LDU ) + END IF + END IF + ELSE + CALL CLACPY( 'U', NR, N, A, LDA, U(NR+1,1), LDU ) + IF ( NR .GT. 1 ) + $ CALL CLASET('L',NR-1,NR-1,CZERO,CZERO,U(NR+2,1),LDU) + CALL CGELQF( NR, N, U(NR+1,1), LDU, CWORK(N+1), + $ CWORK(N+NR+1), LCWORK-N-NR, IERR ) + CALL CLACPY('L',NR,NR,U(NR+1,1),LDU,V,LDV) + IF ( NR .GT. 1 ) + $ CALL CLASET('U',NR-1,NR-1,CZERO,CZERO,V(1,2),LDV) + CALL CGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+NR+1), LCWORK-N-NR, RWORK, INFO ) + CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV) + CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + CALL CUNMLQ('R','N',N,N,NR,U(NR+1,1),LDU,CWORK(N+1), + $ V, LDV, CWORK(N+NR+1),LCWORK-N-NR,IERR) + CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL CLASET('A',M-NR,NR,CZERO,CZERO,U(NR+1,1),LDU) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF + END IF + END IF +* .. end of the "R**H or R" branch + END IF +* +* The Q matrix from the first QRF is built into the left singular +* vectors matrix U. +* + IF ( .NOT. WNTUF ) + $ CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LCWORK-N, IERR ) + IF ( ROWPRM .AND. .NOT.WNTUF ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 ) +* +* ... end of the "full SVD" branch + END IF +* +* Check whether some singular values are returned as zeros, e.g. +* due to underflow, and update the numerical rank. + p = NR + DO 4001 q = p, 1, -1 + IF ( S(q) .GT. ZERO ) GO TO 4002 + NR = NR - 1 + 4001 CONTINUE + 4002 CONTINUE +* +* .. if numerical rank deficiency is detected, the truncated +* singular values are set to zero. + IF ( NR .LT. N ) CALL SLASET( 'G', N-NR,1, ZERO,ZERO, S(NR+1), N ) +* .. undo scaling; this may cause overflow in the largest singular +* values. + IF ( ASCALED ) + $ CALL SLASCL( 'G',0,0, ONE,SQRT(REAL(M)), NR,1, S, N, IERR ) + IF ( CONDA ) RWORK(1) = SCONDA + RWORK(2) = p - NR +* .. p-NR is the number of singular values that are computed as +* exact zeros in CGESVD() applied to the (possibly truncated) +* full row rank triangular (trapezoidal) factor of A. + NUMRANK = NR +* + RETURN +* +* End of CGESVDQ +* + END diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f index 2a5ced225..81e40efef 100644 --- a/lapack-netlib/SRC/cgesvj.f +++ b/lapack-netlib/SRC/cgesvj.f @@ -89,12 +89,12 @@ *> Specifies whether to compute the right singular vectors, that *> is, the matrix V: *> = 'V' or 'J': the matrix V is computed and returned in the array V -*> = 'A' : the Jacobi rotations are applied to the MV-by-N +*> = 'A': the Jacobi rotations are applied to the MV-by-N *> array V. In other words, the right singular vector *> matrix V is not computed explicitly; instead it is *> applied to an MV-by-N matrix initially stored in the *> first MV rows of V. -*> = 'N' : the matrix V is not computed and the array V is not +*> = 'N': the matrix V is not computed and the array V is not *> referenced *> \endverbatim *> @@ -116,8 +116,8 @@ *> A is COMPLEX array, dimension (LDA,N) *> On entry, the M-by-N matrix A. *> On exit, -*> If JOBU .EQ. 'U' .OR. JOBU .EQ. 'C': -*> If INFO .EQ. 0 : +*> If JOBU = 'U' .OR. 
JOBU = 'C': +*> If INFO = 0 : *> RANKA orthonormal columns of U are returned in the *> leading RANKA columns of the array A. Here RANKA <= N *> is the number of computed singular values of A that are @@ -127,9 +127,9 @@ *> in the array RWORK as RANKA=NINT(RWORK(2)). Also see the *> descriptions of SVA and RWORK. The computed columns of U *> are mutually numerically orthogonal up to approximately -*> TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU.EQ.'C'), +*> TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU = 'C'), *> see the description of JOBU. -*> If INFO .GT. 0, +*> If INFO > 0, *> the procedure CGESVJ did not converge in the given number *> of iterations (sweeps). In that case, the computed *> columns of U may not be orthogonal up to TOL. The output @@ -137,8 +137,8 @@ *> values in SVA(1:N)) and V is still a decomposition of the *> input matrix A in the sense that the residual *> || A - SCALE * U * SIGMA * V^* ||_2 / ||A||_2 is small. -*> If JOBU .EQ. 'N': -*> If INFO .EQ. 0 : +*> If JOBU = 'N': +*> If INFO = 0 : *> Note that the left singular vectors are 'for free' in the *> one-sided Jacobi SVD algorithm. However, if only the *> singular values are needed, the level of numerical @@ -147,7 +147,7 @@ *> numerically orthogonal up to approximately M*EPS. Thus, *> on exit, A contains the columns of U scaled with the *> corresponding singular values. -*> If INFO .GT. 0 : +*> If INFO > 0 : *> the procedure CGESVJ did not converge in the given number *> of iterations (sweeps). *> \endverbatim @@ -162,9 +162,9 @@ *> \verbatim *> SVA is REAL array, dimension (N) *> On exit, -*> If INFO .EQ. 0 : +*> If INFO = 0 : *> depending on the value SCALE = RWORK(1), we have: -*> If SCALE .EQ. ONE: +*> If SCALE = ONE: *> SVA(1:N) contains the computed singular values of A. *> During the computation SVA contains the Euclidean column *> norms of the iterated matrices in the array A. @@ -173,7 +173,7 @@ *> factored representation is due to the fact that some of the *> singular values of A might underflow or overflow. *> -*> If INFO .GT. 0 : +*> If INFO > 0 : *> the procedure CGESVJ did not converge in the given number of *> iterations (sweeps) and SCALE*SVA(1:N) may not be accurate. *> \endverbatim @@ -181,7 +181,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then the product of Jacobi rotations in CGESVJ +*> If JOBV = 'A', then the product of Jacobi rotations in CGESVJ *> is applied to the first MV rows of V. See the description of JOBV. *> \endverbatim *> @@ -199,16 +199,16 @@ *> \param[in] LDV *> \verbatim *> LDV is INTEGER -*> The leading dimension of the array V, LDV .GE. 1. -*> If JOBV .EQ. 'V', then LDV .GE. max(1,N). -*> If JOBV .EQ. 'A', then LDV .GE. max(1,MV) . +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V', then LDV >= max(1,N). +*> If JOBV = 'A', then LDV >= max(1,MV) . *> \endverbatim *> *> \param[in,out] CWORK *> \verbatim *> CWORK is COMPLEX array, dimension (max(1,LWORK)) *> Used as workspace. -*> If on entry LWORK .EQ. -1, then a workspace query is assumed and +*> If on entry LWORK = -1, then a workspace query is assumed and *> no computation is done; CWORK(1) is set to the minial (and optimal) *> length of CWORK. *> \endverbatim @@ -223,7 +223,7 @@ *> \verbatim *> RWORK is REAL array, dimension (max(6,LRWORK)) *> On entry, -*> If JOBU .EQ. 'C' : +*> If JOBU = 'C' : *> RWORK(1) = CTOL, where CTOL defines the threshold for convergence. *> The process stops if all columns of A are mutually *> orthogonal up to CTOL*EPS, EPS=SLAMCH('E'). 
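The LWORK = -1 / LRWORK = -1 workspace query described in the CWORK and RWORK hunks above can be exercised as in the short sketch below. The sketch is not part of the patch; the JOBA/JOBU/JOBV settings and the array extents are illustrative assumptions, and only the query convention itself is taken from the documentation.

      PROGRAM SVJQRY
*     Workspace query for CGESVJ: with LWORK = -1 and LRWORK = -1 no
*     computation is performed; the routine only reports the minimal
*     (and optimal) workspace lengths in CWORK(1) and RWORK(1).
      INTEGER            M, N, LDA, LDV, MV, INFO
      PARAMETER          ( M = 100, N = 50, LDA = M, LDV = N, MV = 0 )
      COMPLEX            A( LDA, N ), V( LDV, N ), CWORK( 1 )
      REAL               SVA( N ), RWORK( 6 )
      EXTERNAL           CGESVJ
      CALL CGESVJ( 'G', 'U', 'V', M, N, A, LDA, SVA, MV, V, LDV,
     $             CWORK, -1, RWORK, -1, INFO )
*     REAL( CWORK( 1 ) ) and RWORK( 1 ) now hold the required lengths
*     of CWORK and RWORK for the actual decomposition call.
      END

The same two-phase pattern (query, allocate, call again) applies to most LWORK-carrying LAPACK routines.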
@@ -243,11 +243,11 @@ *> RWORK(5) = max_{i.NE.j} |COS(A(:,i),A(:,j))| in the last sweep. *> This is useful information in cases when CGESVJ did *> not converge, as it can be used to estimate whether -*> the output is stil useful and for post festum analysis. +*> the output is still useful and for post festum analysis. *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. -*> If on entry LRWORK .EQ. -1, then a workspace query is assumed and +*> If on entry LRWORK = -1, then a workspace query is assumed and *> no computation is done; RWORK(1) is set to the minial (and optimal) *> length of RWORK. *> \endverbatim @@ -261,9 +261,9 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value -*> > 0 : CGESVJ did not converge in the maximal allowed number +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value +*> > 0: CGESVJ did not converge in the maximal allowed number *> (NSWEEP=30) of sweeps. The output may still be useful. *> See the description of RWORK. *> \endverbatim diff --git a/lapack-netlib/SRC/cgesvxx.f b/lapack-netlib/SRC/cgesvxx.f index 30d1beb33..383e4d011 100644 --- a/lapack-netlib/SRC/cgesvxx.f +++ b/lapack-netlib/SRC/cgesvxx.f @@ -411,7 +411,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -447,14 +447,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -462,9 +462,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. 
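The NPARAMS/PARAMS convention documented in the CGESVXX hunks above is shared by the *SVXX expert drivers: a nonpositive NPARAMS means PARAMS is never referenced, and any entry below zero is overwritten with the default actually used. A free-standing sketch of that rule follows; the name PARDEF is hypothetical, and only the parameter-1 default of 1.0 quoted above is modelled (the other positions have their own defaults).

      SUBROUTINE PARDEF( NPARAMS, PARAMS )
*     Mimic the documented handling of PARAMS: with NPARAMS <= 0 the
*     array is never referenced; otherwise a negative entry is
*     replaced by the default for that parameter.  Only parameter 1
*     (iterative refinement, default 1.0) is modelled here.
      INTEGER            NPARAMS
      REAL               PARAMS( * )
      IF( NPARAMS.GE.1 ) THEN
         IF( PARAMS( 1 ).LT.0.0E+0 ) PARAMS( 1 ) = 1.0E+0
      END IF
      END

A caller that instead sets NPARAMS = 1 and PARAMS(1) = 0.0 before invoking the driver switches iterative refinement, and with it the error bounds, off, exactly as the = 0.0 case above states.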
diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index e7c5d8120..3d783be66 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -1,3 +1,5 @@ +*> \brief \b CGETSLS +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/cggesx.f b/lapack-netlib/SRC/cggesx.f index 74169ff80..acc4eda36 100644 --- a/lapack-netlib/SRC/cggesx.f +++ b/lapack-netlib/SRC/cggesx.f @@ -120,10 +120,10 @@ *> \verbatim *> SENSE is CHARACTER*1 *> Determines which reciprocal condition numbers are computed. -*> = 'N' : None are computed; -*> = 'E' : Computed for average of selected eigenvalues only; -*> = 'V' : Computed for selected deflating subspaces only; -*> = 'B' : Computed for both. +*> = 'N': None are computed; +*> = 'E': Computed for average of selected eigenvalues only; +*> = 'V': Computed for selected deflating subspaces only; +*> = 'B': Computed for both. *> If SENSE = 'E', 'V', or 'B', SORT must equal 'S'. *> \endverbatim *> diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f index 80e67a06e..810df3367 100644 --- a/lapack-netlib/SRC/cgsvj0.f +++ b/lapack-netlib/SRC/cgsvj0.f @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX array, dimension (LDV,N) -*> If JOBV .EQ. 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multipled by a *> sequence of Jacobi rotations. -*> If JOBV .EQ. 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim @@ -136,8 +136,8 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', LDV .GE. N. -*> If JOBV = 'A', LDV .GE. MV. +*> If JOBV = 'V', LDV >= N. +*> If JOBV = 'A', LDV >= MV. *> \endverbatim *> *> \param[in] EPS @@ -157,7 +157,7 @@ *> TOL is REAL *> TOL is the threshold for Jacobi rotations. For a pair *> A(:,p), A(:,q) of pivot columns, the Jacobi rotation is -*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL. +*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL. *> \endverbatim *> *> \param[in] NSWEEP @@ -175,14 +175,14 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> LWORK is the dimension of WORK. LWORK .GE. M. +*> LWORK is the dimension of WORK. LWORK >= M. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f index bebcd5c45..06b417cf2 100644 --- a/lapack-netlib/SRC/cgsvj1.f +++ b/lapack-netlib/SRC/cgsvj1.f @@ -61,7 +61,7 @@ *> In terms of the columns of A, the first N1 columns are rotated 'against' *> the remaining N-N1 columns, trying to increase the angle between the *> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is -*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parmeter. +*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parameter. 
*> The number of sweeps is given in NSWEEP and the orthogonality threshold *> is given in TOL. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX array, dimension (LDV,N) -*> If JOBV .EQ. 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multipled by a *> sequence of Jacobi rotations. -*> If JOBV .EQ. 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim @@ -166,8 +166,8 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', LDV .GE. N. -*> If JOBV = 'A', LDV .GE. MV. +*> If JOBV = 'V', LDV >= N. +*> If JOBV = 'A', LDV >= MV. *> \endverbatim *> *> \param[in] EPS @@ -187,7 +187,7 @@ *> TOL is REAL *> TOL is the threshold for Jacobi rotations. For a pair *> A(:,p), A(:,q) of pivot columns, the Jacobi rotation is -*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL. +*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL. *> \endverbatim *> *> \param[in] NSWEEP @@ -205,14 +205,14 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> LWORK is the dimension of WORK. LWORK .GE. M. +*> LWORK is the dimension of WORK. LWORK >= M. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/chb2st_kernels.f b/lapack-netlib/SRC/chb2st_kernels.f index 25c9ab717..01ea217bb 100644 --- a/lapack-netlib/SRC/chb2st_kernels.f +++ b/lapack-netlib/SRC/chb2st_kernels.f @@ -1,26 +1,26 @@ *> \brief \b CHB2ST_KERNELS * * @generated from zhb2st_kernels.f, fortran z -> c, Wed Dec 7 08:22:40 2016 -* +* * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download CHB2ST_KERNELS + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download CHB2ST_KERNELS + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, +* SUBROUTINE CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, * ST, ED, SWEEP, N, NB, IB, * A, LDA, V, TAU, LDVT, WORK) * @@ -32,9 +32,9 @@ * INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT * .. * .. Array Arguments .. -* COMPLEX A( LDA, * ), V( * ), +* COMPLEX A( LDA, * ), V( * ), * TAU( * ), WORK( * ) -* +* *> \par Purpose: * ============= *> @@ -124,7 +124,7 @@ *> LDVT is INTEGER. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array. Workspace of size nb. *> \endverbatim @@ -147,7 +147,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -155,16 +155,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, + SUBROUTINE CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, $ ST, ED, SWEEP, N, NB, IB, $ A, LDA, V, TAU, LDVT, WORK) * @@ -181,7 +181,7 @@ INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), V( * ), + COMPLEX A( LDA, * ), V( * ), $ TAU( * ), WORK( * ) * .. * @@ -195,8 +195,8 @@ * .. Local Scalars .. LOGICAL UPPER INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS, - $ DPOS, OFDPOS, AJETER - COMPLEX CTMP + $ DPOS, OFDPOS, AJETER + COMPLEX CTMP * .. * .. External Subroutines .. EXTERNAL CLARFG, CLARFX, CLARFY @@ -209,7 +209,7 @@ * .. * .. * .. Executable Statements .. -* +* AJETER = IB + LDVT UPPER = LSAME( UPLO, 'U' ) @@ -240,10 +240,10 @@ V( VPOS ) = ONE DO 10 I = 1, LM-1 V( VPOS+I ) = CONJG( A( OFDPOS-I, ST+I ) ) - A( OFDPOS-I, ST+I ) = ZERO + A( OFDPOS-I, ST+I ) = ZERO 10 CONTINUE CTMP = CONJG( A( OFDPOS, ST ) ) - CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, + CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) A( OFDPOS, ST ) = CTMP * @@ -281,14 +281,14 @@ * V( VPOS ) = ONE DO 30 I = 1, LM-1 - V( VPOS+I ) = + V( VPOS+I ) = $ CONJG( A( DPOS-NB-I, J1+I ) ) A( DPOS-NB-I, J1+I ) = ZERO 30 CONTINUE CTMP = CONJG( A( DPOS-NB, J1 ) ) CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) ) A( DPOS-NB, J1 ) = CTMP -* +* CALL CLARFX( 'Right', LN-1, LM, V( VPOS ), $ TAU( TAUPOS ), $ A( DPOS-NB+1, J1 ), LDA-1, WORK) @@ -296,9 +296,9 @@ ENDIF * * Lower case -* +* ELSE -* +* IF( WANTZ ) THEN VPOS = MOD( SWEEP-1, 2 ) * N + ST TAUPOS = MOD( SWEEP-1, 2 ) * N + ST @@ -313,9 +313,9 @@ V( VPOS ) = ONE DO 20 I = 1, LM-1 V( VPOS+I ) = A( OFDPOS+I, ST-1 ) - A( OFDPOS+I, ST-1 ) = ZERO + A( OFDPOS+I, ST-1 ) = ZERO 20 CONTINUE - CALL CLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, + CALL CLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) * LM = ED - ST + 1 @@ -342,7 +342,7 @@ LM = J2-J1+1 * IF( LM.GT.0) THEN - CALL CLARFX( 'Right', LM, LN, V( VPOS ), + CALL CLARFX( 'Right', LM, LN, V( VPOS ), $ TAU( TAUPOS ), A( DPOS+NB, ST ), $ LDA-1, WORK) * @@ -359,13 +359,13 @@ V( VPOS+I ) = A( DPOS+NB+I, ST ) A( DPOS+NB+I, ST ) = ZERO 40 CONTINUE - CALL CLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, + CALL CLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) * - CALL CLARFX( 'Left', LM, LN-1, V( VPOS ), + CALL CLARFX( 'Left', LM, LN-1, V( VPOS ), $ CONJG( TAU( TAUPOS ) ), $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK) - + ENDIF ENDIF ENDIF @@ -374,4 +374,4 @@ * * END OF CHB2ST_KERNELS * - END + END diff --git a/lapack-netlib/SRC/checon_3.f b/lapack-netlib/SRC/checon_3.f index 
6427dd594..5d9ed97e9 100644 --- a/lapack-netlib/SRC/checon_3.f +++ b/lapack-netlib/SRC/checon_3.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE CHECON_3( UPLO, N, A, LDA, E, IPIV, ANORM, RCOND, -* WORK, IWORK, INFO ) +* WORK, INFO ) * * .. Scalar Arguments .. * CHARACTER UPLO @@ -27,7 +27,7 @@ * REAL ANORM, RCOND * .. * .. Array Arguments .. -* INTEGER IPIV( * ), IWORK( * ) +* INTEGER IPIV( * ) * COMPLEX A( LDA, * ), E ( * ), WORK( * ) * .. * @@ -129,11 +129,6 @@ *> WORK is COMPLEX array, dimension (2*N) *> \endverbatim *> -*> \param[out] IWORK -*> \verbatim -*> IWORK is INTEGER array, dimension (N) -*> \endverbatim -*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f index 0b055baf6..c5deb1166 100644 --- a/lapack-netlib/SRC/cheevr.f +++ b/lapack-netlib/SRC/cheevr.f @@ -210,7 +210,7 @@ *> eigenvalues are computed to high relative accuracy when *> possible in future releases. The current code does not *> make any guarantees about high relative accuracy, but -*> furutre releases will. See J. Barlow and J. Demmel, +*> future releases will. See J. Barlow and J. Demmel, *> "Computing Accurate Eigensystems of Scaled Diagonally *> Dominant Matrices", LAPACK Working Note #7, for a discussion *> of which matrices define their eigenvalues to high relative diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f index 20a1cb3f3..1489a322e 100644 --- a/lapack-netlib/SRC/cheevr_2stage.f +++ b/lapack-netlib/SRC/cheevr_2stage.f @@ -217,7 +217,7 @@ *> eigenvalues are computed to high relative accuracy when *> possible in future releases. The current code does not *> make any guarantees about high relative accuracy, but -*> furutre releases will. See J. Barlow and J. Demmel, +*> future releases will. See J. Barlow and J. Demmel, *> "Computing Accurate Eigensystems of Scaled Diagonally *> Dominant Matrices", LAPACK Working Note #7, for a discussion *> of which matrices define their eigenvalues to high relative diff --git a/lapack-netlib/SRC/chegs2.f b/lapack-netlib/SRC/chegs2.f index 68d2f6625..55a895fc3 100644 --- a/lapack-netlib/SRC/chegs2.f +++ b/lapack-netlib/SRC/chegs2.f @@ -97,6 +97,7 @@ *> B is COMPLEX array, dimension (LDB,N) *> The triangular factor from the Cholesky factorization of B, *> as returned by CPOTRF. +*> B is modified by the routine but restored on exit. *> \endverbatim *> *> \param[in] LDB diff --git a/lapack-netlib/SRC/chegst.f b/lapack-netlib/SRC/chegst.f index 2f933729c..b3fdff2d5 100644 --- a/lapack-netlib/SRC/chegst.f +++ b/lapack-netlib/SRC/chegst.f @@ -97,6 +97,7 @@ *> B is COMPLEX array, dimension (LDB,N) *> The triangular factor from the Cholesky factorization of B, *> as returned by CPOTRF. +*> B is modified by the routine but restored on exit. *> \endverbatim *> *> \param[in] LDB diff --git a/lapack-netlib/SRC/cherfsx.f b/lapack-netlib/SRC/cherfsx.f index 4ed2c99f7..76cef7cd1 100644 --- a/lapack-netlib/SRC/cherfsx.f +++ b/lapack-netlib/SRC/cherfsx.f @@ -102,7 +102,7 @@ *> \param[in] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> The symmetric matrix A. If UPLO = 'U', the leading N-by-N +*> The Hermitian matrix A. If UPLO = 'U', the leading N-by-N *> upper triangular part of A contains the upper triangular *> part of the matrix A, and the strictly lower triangular *> part of A is not referenced. If UPLO = 'L', the leading @@ -270,7 +270,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. 
If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -306,14 +306,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -321,9 +321,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 470f910bc..b934e624b 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -42,7 +42,7 @@ *> matrices. *> *> Aasen's algorithm is used to factor A as -*> A = U * T * U**H, if UPLO = 'U', or +*> A = U**H * T * U, if UPLO = 'U', or *> A = L * T * L**H, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is Hermitian and tridiagonal. The factored form @@ -86,7 +86,7 @@ *> *> On exit, if INFO = 0, the tridiagonal matrix T and the *> multipliers used to obtain the factor U or L from the -*> factorization A = U*T*U**H or A = L*T*L**H as computed by +*> factorization A = U**H*T*U or A = L*T*L**H as computed by *> CHETRF_AA. *> \endverbatim *> @@ -230,7 +230,7 @@ RETURN END IF * -* Compute the factorization A = U*T*U**H or A = L*T*L**H. +* Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) IF( INFO.EQ.0 ) THEN diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 05f6b7bb7..ab5786d57 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -43,7 +43,7 @@ *> matrices. *> *> Aasen's 2-stage algorithm is used to factor A as -*> A = U * T * U**H, if UPLO = 'U', or +*> A = U**H * T * U, if UPLO = 'U', or *> A = L * T * L**H, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is Hermitian and band. The matrix T is @@ -257,7 +257,7 @@ END IF * * -* Compute the factorization A = U*T*U**H or A = L*T*L**H. +* Compute the factorization A = U**H*T*U or A = L*T*L**H. 
* CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, $ WORK, LWORK, INFO ) diff --git a/lapack-netlib/SRC/chesvxx.f b/lapack-netlib/SRC/chesvxx.f index 3f4466d41..c59e72bbf 100644 --- a/lapack-netlib/SRC/chesvxx.f +++ b/lapack-netlib/SRC/chesvxx.f @@ -46,7 +46,7 @@ *> *> CHESVXX uses the diagonal pivoting factorization to compute the *> solution to a complex system of linear equations A * X = B, where -*> A is an N-by-N symmetric matrix and X and B are N-by-NRHS +*> A is an N-by-N Hermitian matrix and X and B are N-by-NRHS *> matrices. *> *> If requested, both normwise and maximum componentwise error bounds @@ -88,7 +88,7 @@ *> A = L * D * L**T, if UPLO = 'L', *> *> where U (or L) is a product of permutation and unit upper (lower) -*> triangular matrices, and D is symmetric and block diagonal with +*> triangular matrices, and D is Hermitian and block diagonal with *> 1-by-1 and 2-by-2 diagonal blocks. *> *> 3. If some D(i,i)=0, so that D is exactly singular, then the @@ -161,7 +161,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> The symmetric matrix A. If UPLO = 'U', the leading N-by-N +*> The Hermitian matrix A. If UPLO = 'U', the leading N-by-N *> upper triangular part of A contains the upper triangular *> part of the matrix A, and the strictly lower triangular *> part of A is not referenced. If UPLO = 'L', the leading @@ -378,7 +378,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -414,14 +414,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -429,9 +429,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. 
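Several hunks above touch the Aasen-based Hermitian drivers, so a minimal driver-level sketch of CHESV_AA may be useful. It is not part of the patch; the matrix order, right-hand side count, and workspace length are illustrative assumptions (LWORK is simply chosen generously here rather than queried).

      PROGRAM AASOLV
*     Solve A*X = B for Hermitian A via the Aasen factorization
*     A = U**H*T*U (UPLO = 'U') described above.
      INTEGER            N, NRHS, LWORK
      PARAMETER          ( N = 4, NRHS = 1, LWORK = 64*N )
      INTEGER            IPIV( N ), INFO
      COMPLEX            A( N, N ), B( N, NRHS ), WORK( LWORK )
      EXTERNAL           CHESV_AA
*     ... fill the upper triangle of A and the right-hand side B ...
      CALL CHESV_AA( 'U', N, NRHS, A, N, IPIV, B, N, WORK, LWORK,
     $               INFO )
*     On exit with INFO = 0, B holds the solution X and A holds the
*     tridiagonal T plus the multipliers defining U.
      END

The solve phase applied by the driver is exactly the three-step sequence spelled out in the reworked CHETRS_AA comments further below: forward substitution with U**H, a tridiagonal solve with T (CGTSV), then backward substitution with U.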
diff --git a/lapack-netlib/SRC/chetf2_rk.f b/lapack-netlib/SRC/chetf2_rk.f index 38a0ce373..80e2f61b7 100644 --- a/lapack-netlib/SRC/chetf2_rk.f +++ b/lapack-netlib/SRC/chetf2_rk.f @@ -322,7 +322,7 @@ * * Factorize A as U*D*U**H using the upper triangle of A * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = CZERO @@ -676,7 +676,7 @@ * * Factorize A as L*D*L**H using the lower triangle of A * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = CZERO * diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f index e7370a4dd..4575a5e90 100644 --- a/lapack-netlib/SRC/chetrd_2stage.f +++ b/lapack-netlib/SRC/chetrd_2stage.f @@ -123,23 +123,22 @@ *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX array, dimension LHOUS2, that -*> store the Householder representation of the stage2 +*> HOUS2 is COMPLEX array, dimension (LHOUS2) +*> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim *> *> \param[in] LHOUS2 *> \verbatim *> LHOUS2 is INTEGER -*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension) +*> The dimension of the array HOUS2. *> If LWORK = -1, or LHOUS2=-1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns *> this value as the first entry of the HOUS2 array, and no error *> message related to LHOUS2 is issued by XERBLA. -*> LHOUS2 = MAX(1, dimension) where -*> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> If VECT='N', LHOUS2 = max(1, 4*n); +*> if VECT='V', option not yet available. *> \endverbatim *> *> \param[out] WORK @@ -151,7 +150,7 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 43da45640..a3d8259d3 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -50,9 +50,9 @@ * Arguments: * ========== * -*> \param[in] STAGE +*> \param[in] STAGE1 *> \verbatim -*> STAGE is CHARACTER*1 +*> STAGE1 is CHARACTER*1 *> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the chetrd_he2hb routine *> was not called before this routine to reproduce AB. diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index e334532fe..e85c1fd01 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -363,7 +363,7 @@ * * * Set the workspace of the triangular matrix T to zero once such a -* way everytime T is generated the upper/lower portion will be always zero +* way every time T is generated the upper/lower portion will be always zero * CALL CLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT ) * diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 2c5564893..c6f548d42 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -37,7 +37,7 @@ *> CHETRF_AA computes the factorization of a complex hermitian matrix A *> using the Aasen's algorithm. 
The form of the factorization is *> -*> A = U*T*U**H or A = L*T*L**H +*> A = U**H*T*U or A = L*T*L**H *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a hermitian tridiagonal matrix. @@ -223,7 +223,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**H using the upper triangle of A +* Factorize A as U**H*D*U using the upper triangle of A * ..................................................... * * copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N)) @@ -256,7 +256,7 @@ $ A( MAX(1, J), J+1 ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J @@ -376,7 +376,7 @@ $ A( J+1, MAX(1, J) ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index ce34d73cc..d2e0e0023 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -38,7 +38,7 @@ *> CHETRF_AA_2STAGE computes the factorization of a real hermitian matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a hermitian band matrix with the @@ -277,7 +277,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... 
* DO J = 0, NT-1 @@ -453,14 +453,17 @@ c END IF * > Apply pivots to previous columns of L CALL CSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, $ A( (J+1)*NB+1, I2 ), 1 ) -* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL CSWAP( I2-I1-1, A( I1, I1+1 ), LDA, - $ A( I1+1, I2 ), 1 ) +* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) + IF( I2.GT.(I1+1) ) THEN + CALL CSWAP( I2-I1-1, A( I1, I1+1 ), LDA, + $ A( I1+1, I2 ), 1 ) + CALL CLACGV( I2-I1-1, A( I1+1, I2 ), 1 ) + END IF CALL CLACGV( I2-I1, A( I1, I1+1 ), LDA ) - CALL CLACGV( I2-I1-1, A( I1+1, I2 ), 1 ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL CSWAP( N-I2, A( I1, I2+1 ), LDA, - $ A( I2, I2+1 ), LDA ) + IF( I2.LT.N ) + $ CALL CSWAP( N-I2, A( I1, I2+1 ), LDA, + $ A( I2, I2+1 ), LDA ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) @@ -630,14 +633,17 @@ c END IF * > Apply pivots to previous columns of L CALL CSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, $ A( I2, (J+1)*NB+1 ), LDA ) -* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL CSWAP( I2-I1-1, A( I1+1, I1 ), 1, - $ A( I2, I1+1 ), LDA ) +* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) + IF( I2.GT.(I1+1) ) THEN + CALL CSWAP( I2-I1-1, A( I1+1, I1 ), 1, + $ A( I2, I1+1 ), LDA ) + CALL CLACGV( I2-I1-1, A( I2, I1+1 ), LDA ) + END IF CALL CLACGV( I2-I1, A( I1+1, I1 ), 1 ) - CALL CLACGV( I2-I1-1, A( I2, I1+1 ), LDA ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL CSWAP( N-I2, A( I2+1, I1 ), 1, - $ A( I2+1, I2 ), 1 ) + IF( I2.LT.N ) + $ CALL CSWAP( N-I2, A( I2+1, I1 ), 1, + $ A( I2+1, I2 ), 1 ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f index 722d13008..1e18202cf 100644 --- a/lapack-netlib/SRC/chetri2.f +++ b/lapack-netlib/SRC/chetri2.f @@ -62,7 +62,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers +*> On entry, the block diagonal matrix D and the multipliers *> used to obtain the factor U or L as computed by CHETRF. *> *> On exit, if INFO = 0, the (symmetric) inverse of the original @@ -82,7 +82,7 @@ *> \param[in] IPIV *> \verbatim *> IPIV is INTEGER array, dimension (N) -*> Details of the interchanges and the NB structure of D +*> Details of the interchanges and the block structure of D *> as determined by CHETRF. *> \endverbatim *> diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f index 50e5692db..877517031 100644 --- a/lapack-netlib/SRC/chetrs_aa.f +++ b/lapack-netlib/SRC/chetrs_aa.f @@ -37,7 +37,7 @@ *> \verbatim *> *> CHETRS_AA solves a system of linear equations A*X = B with a complex -*> hermitian matrix A using the factorization A = U*T*U**H or +*> hermitian matrix A using the factorization A = U**H*T*U or *> A = L*T*L**H computed by CHETRF_AA. *> \endverbatim * @@ -49,7 +49,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**H; +*> = 'U': Upper triangular, form is A = U**H*T*U; *> = 'L': Lower triangular, form is A = L*T*L**H. *> \endverbatim *> @@ -97,14 +97,16 @@ *> The leading dimension of the array B. LDB >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim -*> WORK is DOUBLE array, dimension (MAX(1,LWORK)) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER, LWORK >= MAX(1,3*N-2). 
+*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -198,24 +200,31 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**H*T*U. +* +* 1) Forward substitution with U**H +* + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B * -* P**T * B + K = 1 + DO WHILE ( K.LE.N ) + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + K = K + 1 + END DO * - K = 1 - DO WHILE ( K.LE.N ) - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - K = K + 1 - END DO +* Compute U**H \ B -> B [ (U**H \P**T * B) ] * -* Compute (U \P**T * B) -> B [ (U \P**T * B) ] + CALL CTRSM( 'L', 'U', 'C', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB) + END IF * - CALL CTRSM('L', 'U', 'C', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) +* 2) Solve with triangular matrix T * -* Compute T \ B -> B [ T \ (U \P**T * B) ] +* Compute T \ B -> B [ T \ (U**H \P**T * B) ] * CALL CLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1) IF( N.GT.1 ) THEN @@ -226,65 +235,82 @@ CALL CGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB, $ INFO) * -* Compute (U**T \ B) -> B [ U**T \ (T \ (U \P**T * B) ) ] +* 3) Backward substitution with U +* + IF( N.GT.1 ) THEN * - CALL CTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B(2, 1), LDB) +* Compute U \ B -> B [ U \ (T \ (U**H \P**T * B) ) ] * -* Pivot, P * B [ P * (U**T \ (T \ (U \P**T * B) )) ] + CALL CTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B(2, 1), LDB) * - K = N - DO WHILE ( K.GE.1 ) - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - K = K - 1 - END DO +* Pivot, P * B -> B [ P * (U \ (T \ (U**H \P**T * B) )) ] +* + K = N + DO WHILE ( K.GE.1 ) + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + K = K - 1 + END DO + END IF * ELSE * -* Solve A*X = B, where A = L*T*L**T. +* Solve A*X = B, where A = L*T*L**H. 
* -* Pivot, P**T * B +* 1) Forward substitution with L * - K = 1 - DO WHILE ( K.LE.N ) - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - K = K + 1 - END DO + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B +* + K = 1 + DO WHILE ( K.LE.N ) + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + K = K + 1 + END DO * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute L \ B -> B [ (L \P**T * B) ] +* + CALL CTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1), + $ LDA, B(2, 1), LDB ) + END IF * - CALL CTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1), LDA, - $ B(2, 1), LDB) +* 2) Solve with triangular matrix T * * Compute T \ B -> B [ T \ (L \P**T * B) ] * CALL CLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1) IF( N.GT.1 ) THEN - CALL CLACPY( 'F', 1, N-1, A( 2, 1 ), LDA+1, WORK( 1 ), 1) + CALL CLACPY( 'F', 1, N-1, A( 2, 1 ), LDA+1, WORK( 1 ), 1 ) CALL CLACPY( 'F', 1, N-1, A( 2, 1 ), LDA+1, WORK( 2*N ), 1) CALL CLACGV( N-1, WORK( 2*N ), 1 ) END IF CALL CGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB, $ INFO) * -* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] +* 3) Backward substitution with L**H * - CALL CTRSM( 'L', 'L', 'C', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) + IF( N.GT.1 ) THEN +* +* Compute (L**H \ B) -> B [ L**H \ (T \ (L \P**T * B) ) ] * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] + CALL CTRSM( 'L', 'L', 'C', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB ) * - K = N - DO WHILE ( K.GE.1 ) - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - K = K - 1 - END DO +* Pivot, P * B -> B [ P * (L**H \ (T \ (L \P**T * B) )) ] +* + K = N + DO WHILE ( K.GE.1 ) + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + K = K - 1 + END DO + END IF * END IF * diff --git a/lapack-netlib/SRC/chetrs_aa_2stage.f b/lapack-netlib/SRC/chetrs_aa_2stage.f index 05d09275b..979d80a7c 100644 --- a/lapack-netlib/SRC/chetrs_aa_2stage.f +++ b/lapack-netlib/SRC/chetrs_aa_2stage.f @@ -38,7 +38,7 @@ *> \verbatim *> *> CHETRS_AA_2STAGE solves a system of linear equations A*X = B with a real -*> hermitian matrix A using the factorization A = U*T*U**T or +*> hermitian matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by CHETRF_AA_2STAGE. *> \endverbatim * @@ -50,7 +50,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -210,15 +210,15 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (U**T \P**T * B) -> B [ (U**T \P**T * B) ] +* Compute (U**T \ B) -> B [ (U**T \P**T * B) ] * CALL CTRSM( 'L', 'U', 'C', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f index 34bf49249..cfcf725b2 100644 --- a/lapack-netlib/SRC/chseqr.f +++ b/lapack-netlib/SRC/chseqr.f @@ -69,7 +69,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. 
*> \endverbatim *> *> \param[in] ILO @@ -86,7 +86,7 @@ *> set by a previous call to CGEBAL, and then passed to ZGEHRD *> when the matrix output by CGEBAL is reduced to Hessenberg *> form. Otherwise ILO and IHI should be set to 1 and N -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -98,17 +98,17 @@ *> triangular matrix T from the Schur decomposition (the *> Schur form). If INFO = 0 and JOB = 'E', the contents of *> H are unspecified on exit. (The output value of H when -*> INFO.GT.0 is given under the description of INFO below.) +*> INFO > 0 is given under the description of INFO below.) *> *> Unlike earlier versions of CHSEQR, this subroutine may -*> explicitly H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1 +*> explicitly H(i,j) = 0 for i > j and j = 1, 2, ... ILO-1 *> or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] W @@ -131,7 +131,7 @@ *> if INFO = 0, Z contains Q*Z. *> Normally Q is the unitary matrix generated by CUNGHR *> after the call to CGEHRD which formed the Hessenberg matrix -*> H. (The output value of Z when INFO.GT.0 is given under +*> H. (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -139,7 +139,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if COMPZ = 'I' or -*> COMPZ = 'V', then LDZ.GE.MAX(1,N). Otherwize, LDZ.GE.1. +*> COMPZ = 'V', then LDZ >= MAX(1,N). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -152,7 +152,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient and delivers very good and sometimes *> optimal performance. However, LWORK as large as 11*N *> may be required for optimal performance. A workspace @@ -170,21 +170,21 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .LT. 0: if INFO = -i, the i-th argument had an illegal +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal *> value -*> .GT. 0: if INFO = i, CHSEQR failed to compute all of -*> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR -*> and WI contain those eigenvalues which have been +*> > 0: if INFO = i, CHSEQR failed to compute all of +*> the eigenvalues. Elements 1:ilo-1 and i+1:n of W +*> contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and JOB = 'E', then on exit, the +*> If INFO > 0 and JOB = 'E', then on exit, the *> remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and JOB = 'S', then on exit +*> If INFO > 0 and JOB = 'S', then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -192,19 +192,19 @@ *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and COMPZ = 'V', then on exit +*> If INFO > 0 and COMPZ = 'V', then on exit *> *> (final value of Z) = (initial value of Z)*U *> *> where U is the unitary matrix in (*) (regard- *> less of the value of JOB.) *> -*> If INFO .GT. 
0 and COMPZ = 'I', then on exit +*> If INFO > 0 and COMPZ = 'I', then on exit *> (final value of Z) = U *> where U is the unitary matrix in (*) (regard- *> less of the value of JOB.) *> -*> If INFO .GT. 0 and COMPZ = 'N', then Z is not +*> If INFO > 0 and COMPZ = 'N', then Z is not *> accessed. *> \endverbatim * @@ -244,8 +244,8 @@ *> This depends on ILO, IHI and NS. NS is the *> number of simultaneous shifts returned *> by ILAENV(ISPEC=15). (See ISPEC=15 below.) -*> The default for (IHI-ILO+1).LE.500 is NS. -*> The default for (IHI-ILO+1).GT.500 is 3*NS/2. +*> The default for (IHI-ILO+1) <= 500 is NS. +*> The default for (IHI-ILO+1) > 500 is 3*NS/2. *> *> ISPEC=14: Nibble crossover point. (See IPARMQ for *> details.) Default: 14% of deflation window @@ -323,8 +323,8 @@ PARAMETER ( NTINY = 11 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare CLAHQR failure. NL .GT. NTINY = 11 is -* . required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom- +* . through a rare CLAHQR failure. NL > NTINY = 11 is +* . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 * . deflation window. ==== diff --git a/lapack-netlib/SRC/cla_gbrcond_c.f b/lapack-netlib/SRC/cla_gbrcond_c.f index 123aee26e..c382ac210 100644 --- a/lapack-netlib/SRC/cla_gbrcond_c.f +++ b/lapack-netlib/SRC/cla_gbrcond_c.f @@ -132,13 +132,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_gbrcond_x.f b/lapack-netlib/SRC/cla_gbrcond_x.f index d04aa7fb8..46991ea14 100644 --- a/lapack-netlib/SRC/cla_gbrcond_x.f +++ b/lapack-netlib/SRC/cla_gbrcond_x.f @@ -125,13 +125,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_gbrfsx_extended.f b/lapack-netlib/SRC/cla_gbrfsx_extended.f index 888ecd4f7..9f066137b 100644 --- a/lapack-netlib/SRC/cla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/cla_gbrfsx_extended.f @@ -65,19 +65,19 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] TRANS_TYPE *> \verbatim *> TRANS_TYPE is INTEGER *> Specifies the transposition operation on A. -*> The value is defined by ILATRANS(T) where T is a CHARACTER and -*> T = 'N': No transpose +*> The value is defined by ILATRANS(T) where T is a CHARACTER and T +*> = 'N': No transpose *> = 'T': Transpose *> = 'C': Conjugate transpose *> \endverbatim @@ -269,7 +269,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 
3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/cla_gercond_c.f b/lapack-netlib/SRC/cla_gercond_c.f index aabdc0bb9..1a2e8230e 100644 --- a/lapack-netlib/SRC/cla_gercond_c.f +++ b/lapack-netlib/SRC/cla_gercond_c.f @@ -21,7 +21,7 @@ * REAL FUNCTION CLA_GERCOND_C( TRANS, N, A, LDA, AF, LDAF, IPIV, C, * CAPPLY, INFO, WORK, RWORK ) * -* .. Scalar Aguments .. +* .. Scalar Arguments .. * CHARACTER TRANS * LOGICAL CAPPLY * INTEGER N, LDA, LDAF, INFO @@ -114,13 +114,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. @@ -147,7 +147,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * -* .. Scalar Aguments .. +* .. Scalar Arguments .. CHARACTER TRANS LOGICAL CAPPLY INTEGER N, LDA, LDAF, INFO diff --git a/lapack-netlib/SRC/cla_gercond_x.f b/lapack-netlib/SRC/cla_gercond_x.f index 6dce99f62..46e9b039f 100644 --- a/lapack-netlib/SRC/cla_gercond_x.f +++ b/lapack-netlib/SRC/cla_gercond_x.f @@ -107,13 +107,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_gerfsx_extended.f b/lapack-netlib/SRC/cla_gerfsx_extended.f index 2e0596334..d231733e6 100644 --- a/lapack-netlib/SRC/cla_gerfsx_extended.f +++ b/lapack-netlib/SRC/cla_gerfsx_extended.f @@ -65,19 +65,19 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] TRANS_TYPE *> \verbatim *> TRANS_TYPE is INTEGER *> Specifies the transposition operation on A. -*> The value is defined by ILATRANS(T) where T is a CHARACTER and -*> T = 'N': No transpose +*> The value is defined by ILATRANS(T) where T is a CHARACTER and T +*> = 'N': No transpose *> = 'T': Transpose *> = 'C': Conjugate transpose *> \endverbatim @@ -257,7 +257,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERRS_C is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERRS_C is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERRS_C(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/cla_hercond_c.f b/lapack-netlib/SRC/cla_hercond_c.f index a5ebaf8a2..5f26822af 100644 --- a/lapack-netlib/SRC/cla_hercond_c.f +++ b/lapack-netlib/SRC/cla_hercond_c.f @@ -110,13 +110,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. 
*> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_hercond_x.f b/lapack-netlib/SRC/cla_hercond_x.f index f0004102f..91c80a668 100644 --- a/lapack-netlib/SRC/cla_hercond_x.f +++ b/lapack-netlib/SRC/cla_hercond_x.f @@ -103,13 +103,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_herfsx_extended.f b/lapack-netlib/SRC/cla_herfsx_extended.f index c69589dfa..d1aa8462c 100644 --- a/lapack-netlib/SRC/cla_herfsx_extended.f +++ b/lapack-netlib/SRC/cla_herfsx_extended.f @@ -66,11 +66,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -254,7 +254,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/cla_porcond_c.f b/lapack-netlib/SRC/cla_porcond_c.f index 7a2bcfe63..c2356590f 100644 --- a/lapack-netlib/SRC/cla_porcond_c.f +++ b/lapack-netlib/SRC/cla_porcond_c.f @@ -102,13 +102,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_porcond_x.f b/lapack-netlib/SRC/cla_porcond_x.f index f0844ec89..a5ff3aa61 100644 --- a/lapack-netlib/SRC/cla_porcond_x.f +++ b/lapack-netlib/SRC/cla_porcond_x.f @@ -95,13 +95,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_porfsx_extended.f b/lapack-netlib/SRC/cla_porfsx_extended.f index 3a3409c9e..545bdc445 100644 --- a/lapack-netlib/SRC/cla_porfsx_extended.f +++ b/lapack-netlib/SRC/cla_porfsx_extended.f @@ -65,11 +65,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -246,7 +246,7 @@ *> information as described below. 
There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/cla_porpvgrw.f b/lapack-netlib/SRC/cla_porpvgrw.f index bd2e7af1c..f10299c5a 100644 --- a/lapack-netlib/SRC/cla_porpvgrw.f +++ b/lapack-netlib/SRC/cla_porpvgrw.f @@ -85,7 +85,7 @@ *> The leading dimension of the array AF. LDAF >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (2*N) *> \endverbatim diff --git a/lapack-netlib/SRC/cla_syrcond_c.f b/lapack-netlib/SRC/cla_syrcond_c.f index fc52bf23b..e59e83aa6 100644 --- a/lapack-netlib/SRC/cla_syrcond_c.f +++ b/lapack-netlib/SRC/cla_syrcond_c.f @@ -110,13 +110,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_syrcond_x.f b/lapack-netlib/SRC/cla_syrcond_x.f index f8fb566e7..3edf58f83 100644 --- a/lapack-netlib/SRC/cla_syrcond_x.f +++ b/lapack-netlib/SRC/cla_syrcond_x.f @@ -103,13 +103,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is REAL array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/cla_syrfsx_extended.f b/lapack-netlib/SRC/cla_syrfsx_extended.f index 5d2fa0cbb..92243abcb 100644 --- a/lapack-netlib/SRC/cla_syrfsx_extended.f +++ b/lapack-netlib/SRC/cla_syrfsx_extended.f @@ -66,11 +66,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -254,7 +254,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/cla_syrpvgrw.f b/lapack-netlib/SRC/cla_syrpvgrw.f index ccea462c7..15e55ea7d 100644 --- a/lapack-netlib/SRC/cla_syrpvgrw.f +++ b/lapack-netlib/SRC/cla_syrpvgrw.f @@ -102,7 +102,7 @@ *> as determined by CSYTRF. 
*> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (2*N) *> \endverbatim diff --git a/lapack-netlib/SRC/cla_wwaddw.f b/lapack-netlib/SRC/cla_wwaddw.f index 9267c6df2..08e45ac79 100644 --- a/lapack-netlib/SRC/cla_wwaddw.f +++ b/lapack-netlib/SRC/cla_wwaddw.f @@ -36,7 +36,7 @@ *> CLA_WWADDW adds a vector W into a doubled-single vector (X, Y). *> *> This works for all extant IBM's hex and binary floating point -*> arithmetics, but not for decimal. +*> arithmetic, but not for decimal. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/clahef_aa.f b/lapack-netlib/SRC/clahef_aa.f index 88bc3d216..934aa92f9 100644 --- a/lapack-netlib/SRC/clahef_aa.f +++ b/lapack-netlib/SRC/clahef_aa.f @@ -288,8 +288,9 @@ * * Swap A(I1, I2+1:N) with A(I2, I2+1:N) * - CALL CSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, - $ A( J1+I2-1, I2+1 ), LDA ) + IF( I2.LT.M ) + $ CALL CSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, + $ A( J1+I2-1, I2+1 ), LDA ) * * Swap A(I1, I1) with A(I2,I2) * @@ -329,13 +330,15 @@ * Compute L(J+2, J+1) = WORK( 3:N ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:N, J) = L(J+2:N, J+1) * - IF( A( K, J+1 ).NE.ZERO ) THEN - ALPHA = ONE / A( K, J+1 ) - CALL CCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) - CALL CSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) - ELSE - CALL CLASET( 'Full', 1, M-J-1, ZERO, ZERO, - $ A( K, J+2 ), LDA) + IF( J.LT.(M-1) ) THEN + IF( A( K, J+1 ).NE.ZERO ) THEN + ALPHA = ONE / A( K, J+1 ) + CALL CCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) + CALL CSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) + ELSE + CALL CLASET( 'Full', 1, M-J-1, ZERO, ZERO, + $ A( K, J+2 ), LDA) + END IF END IF END IF J = J + 1 @@ -440,8 +443,9 @@ * * Swap A(I2+1:N, I1) with A(I2+1:N, I2) * - CALL CSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, - $ A( I2+1, J1+I2-1 ), 1 ) + IF( I2.LT.M ) + $ CALL CSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, + $ A( I2+1, J1+I2-1 ), 1 ) * * Swap A(I1, I1) with A(I2, I2) * @@ -481,13 +485,15 @@ * Compute L(J+2, J+1) = WORK( 3:N ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:N, J) = L(J+2:N, J+1) * - IF( A( J+1, K ).NE.ZERO ) THEN - ALPHA = ONE / A( J+1, K ) - CALL CCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) - CALL CSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) - ELSE - CALL CLASET( 'Full', M-J-1, 1, ZERO, ZERO, - $ A( J+2, K ), LDA ) + IF( J.LT.(M-1) ) THEN + IF( A( J+1, K ).NE.ZERO ) THEN + ALPHA = ONE / A( J+1, K ) + CALL CCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) + CALL CSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) + ELSE + CALL CLASET( 'Full', M-J-1, 1, ZERO, ZERO, + $ A( J+2, K ), LDA ) + END IF END IF END IF J = J + 1 diff --git a/lapack-netlib/SRC/clahef_rk.f b/lapack-netlib/SRC/clahef_rk.f index 4d9dfbe8e..cc4603e9b 100644 --- a/lapack-netlib/SRC/clahef_rk.f +++ b/lapack-netlib/SRC/clahef_rk.f @@ -331,7 +331,7 @@ * of A and working backwards, and compute the matrix W = U12*D * for use in updating A11 (note that conjg(W) is actually stored) * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = CZERO @@ -789,7 +789,7 @@ * of A and working forwards, and compute the matrix W = L21*D * for use in updating A22 (note that conjg(W) is actually stored) * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. 
* E( N ) = CZERO * diff --git a/lapack-netlib/SRC/clahqr.f b/lapack-netlib/SRC/clahqr.f index de2b3938b..ef50b5a56 100644 --- a/lapack-netlib/SRC/clahqr.f +++ b/lapack-netlib/SRC/clahqr.f @@ -138,26 +138,26 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, CLAHQR failed to compute all the +*> = 0: successful exit +*> > 0: if INFO = i, CLAHQR failed to compute all the *> eigenvalues ILO to IHI in a total of 30 iterations *> per eigenvalue; elements i+1:ihi of W contain *> those eigenvalues which have been successfully *> computed. *> -*> If INFO .GT. 0 and WANTT is .FALSE., then on exit, +*> If INFO > 0 and WANTT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the *> eigenvalues of the upper Hessenberg matrix -*> rows and columns ILO thorugh INFO of the final, +*> rows and columns ILO through INFO of the final, *> output value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> (*) (initial value of H)*U = U*(final value of H) -*> where U is an orthognal matrix. The final +*> where U is an orthogonal matrix. The final *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> (final value of Z) = (initial value of Z)*U *> where U is the orthogonal matrix in (*) *> (regardless of the value of WANTT.) diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f index f2f9ab7f9..f6909b666 100644 --- a/lapack-netlib/SRC/clamswlq.f +++ b/lapack-netlib/SRC/clamswlq.f @@ -1,3 +1,4 @@ +*> \brief \b CLAMSWLQ * * Definition: * =========== diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f index 77d09a573..c71e4aa7d 100644 --- a/lapack-netlib/SRC/clamtsqr.f +++ b/lapack-netlib/SRC/clamtsqr.f @@ -1,3 +1,4 @@ +*> \brief \b CLAMTSQR * * Definition: * =========== diff --git a/lapack-netlib/SRC/clangb.f b/lapack-netlib/SRC/clangb.f index 14a163ea7..9818360fe 100644 --- a/lapack-netlib/SRC/clangb.f +++ b/lapack-netlib/SRC/clangb.f @@ -130,6 +130,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER KL, KU, LDAB, N @@ -147,14 +148,17 @@ * .. * .. Local Scalars .. INTEGER I, J, K, L - REAL SCALE, SUM, VALUE, TEMP + REAL SUM, VALUE, TEMP +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -207,15 +211,22 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
* - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N L = MAX( 1, J-KU ) K = KU + 1 - J + L - CALL CLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANGB = VALUE diff --git a/lapack-netlib/SRC/clange.f b/lapack-netlib/SRC/clange.f index 50f705a18..00895c8bc 100644 --- a/lapack-netlib/SRC/clange.f +++ b/lapack-netlib/SRC/clange.f @@ -120,6 +120,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER LDA, M, N @@ -137,14 +138,17 @@ * .. * .. Local Scalars .. INTEGER I, J - REAL SCALE, SUM, VALUE, TEMP + REAL SUM, VALUE, TEMP +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT @@ -196,13 +200,19 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N - CALL CLASSQ( M, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( M, A( 1, J ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANGE = VALUE diff --git a/lapack-netlib/SRC/clanhb.f b/lapack-netlib/SRC/clanhb.f index 2b034b19b..f78de23df 100644 --- a/lapack-netlib/SRC/clanhb.f +++ b/lapack-netlib/SRC/clanhb.f @@ -137,6 +137,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER K, LDAB, N @@ -154,14 +155,17 @@ * .. * .. Local Scalars .. INTEGER I, J, L - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, REAL, SQRT @@ -233,39 +237,57 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
+* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( K.GT.0 ) THEN IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL CLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE L = K + 1 ELSE DO 120 J = 1, N - 1 - CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE L = 1 END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) ELSE L = 1 END IF +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE DO 130 J = 1, N IF( REAL( AB( L, J ) ).NE.ZERO ) THEN ABSA = ABS( REAL( AB( L, J ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + CALL SCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANHB = VALUE diff --git a/lapack-netlib/SRC/clanhe.f b/lapack-netlib/SRC/clanhe.f index 101d778eb..33d6c8b01 100644 --- a/lapack-netlib/SRC/clanhe.f +++ b/lapack-netlib/SRC/clanhe.f @@ -129,6 +129,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER LDA, N @@ -146,14 +147,17 @@ * .. * .. Local Scalars .. INTEGER I, J - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, SQRT @@ -223,31 +227,48 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL CLASSQ( J-1, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( J-1, A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL CLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( N-J, A( J+1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* DO 130 I = 1, N IF( REAL( A( I, I ) ).NE.ZERO ) THEN ABSA = ABS( REAL( A( I, I ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( SSQ( 1 ).LT.ABSA ) THEN + SSQ( 2 ) = ONE + SSQ( 2 )*( SSQ( 1 ) / ABSA )**2 + SSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + SSQ( 2 ) = SSQ( 2 ) + ( ABSA / SSQ( 1 ) )**2 END IF END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANHE = VALUE diff --git a/lapack-netlib/SRC/clanhp.f b/lapack-netlib/SRC/clanhp.f index c8927d503..e0e23abc7 100644 --- a/lapack-netlib/SRC/clanhp.f +++ b/lapack-netlib/SRC/clanhp.f @@ -122,6 +122,7 @@ * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER N @@ -139,14 +140,17 @@ * .. * .. Local Scalars .. INTEGER I, J, K - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, SQRT @@ -225,31 +229,48 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE K = 2 IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL CLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + J 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL CLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 120 CONTINUE END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* K = 1 + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE DO 130 I = 1, N IF( REAL( AP( K ) ).NE.ZERO ) THEN ABSA = ABS( REAL( AP( K ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF IF( LSAME( UPLO, 'U' ) ) THEN @@ -258,7 +279,8 @@ K = K + N - I + 1 END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + CALL SCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANHP = VALUE diff --git a/lapack-netlib/SRC/clanhs.f b/lapack-netlib/SRC/clanhs.f index 35623b73d..661b4f901 100644 --- a/lapack-netlib/SRC/clanhs.f +++ b/lapack-netlib/SRC/clanhs.f @@ -114,6 +114,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER LDA, N @@ -131,14 +132,17 @@ * .. * .. Local Scalars .. INTEGER I, J - REAL SCALE, SUM, VALUE + REAL SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT @@ -190,13 +194,20 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N - CALL CLASSQ( MIN( N, J+1 ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( MIN( N, J+1 ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANHS = VALUE diff --git a/lapack-netlib/SRC/clansb.f b/lapack-netlib/SRC/clansb.f index fbc50674c..1085fd880 100644 --- a/lapack-netlib/SRC/clansb.f +++ b/lapack-netlib/SRC/clansb.f @@ -135,6 +135,7 @@ * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER K, LDAB, N @@ -152,14 +153,17 @@ * .. * .. Local Scalars .. INTEGER I, J, L - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -227,29 +231,47 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( K.GT.0 ) THEN IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL CLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE L = K + 1 ELSE DO 120 J = 1, N - 1 - CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE L = 1 END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) ELSE L = 1 END IF - CALL CLASSQ( N, AB( L, 1 ), LDAB, SCALE, SUM ) - VALUE = SCALE*SQRT( SUM ) +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( N, AB( L, 1 ), LDAB, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANSB = VALUE diff --git a/lapack-netlib/SRC/clansp.f b/lapack-netlib/SRC/clansp.f index fd64366c6..628dc0a75 100644 --- a/lapack-netlib/SRC/clansp.f +++ b/lapack-netlib/SRC/clansp.f @@ -120,6 +120,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER N @@ -137,14 +138,17 @@ * .. * .. Local Scalars .. INTEGER I, J, K - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, REAL, SQRT @@ -219,40 +223,57 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
+* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE K = 2 IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL CLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + J 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL CLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 120 CONTINUE END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* K = 1 + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE DO 130 I = 1, N IF( REAL( AP( K ) ).NE.ZERO ) THEN ABSA = ABS( REAL( AP( K ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF IF( AIMAG( AP( K ) ).NE.ZERO ) THEN ABSA = ABS( AIMAG( AP( K ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF IF( LSAME( UPLO, 'U' ) ) THEN @@ -261,7 +282,8 @@ K = K + N - I + 1 END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + CALL SCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANSP = VALUE diff --git a/lapack-netlib/SRC/clansy.f b/lapack-netlib/SRC/clansy.f index 3aa787410..537fb7ba9 100644 --- a/lapack-netlib/SRC/clansy.f +++ b/lapack-netlib/SRC/clansy.f @@ -128,6 +128,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER LDA, N @@ -145,14 +146,17 @@ * .. * .. Local Scalars .. INTEGER I, J - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT @@ -218,21 +222,39 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
+* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL CLASSQ( J-1, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( J-1, A( 1, J ), 1, COLSSQ(1), COLSSQ(2) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL CLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( N-J, A( J+1, J ), 1, COLSSQ(1), COLSSQ(2) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE END IF - SUM = 2*SUM - CALL CLASSQ( N, A, LDA+1, SCALE, SUM ) - VALUE = SCALE*SQRT( SUM ) + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( N, A, LDA+1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANSY = VALUE diff --git a/lapack-netlib/SRC/clantb.f b/lapack-netlib/SRC/clantb.f index 4b4361c79..8066d0ef6 100644 --- a/lapack-netlib/SRC/clantb.f +++ b/lapack-netlib/SRC/clantb.f @@ -146,6 +146,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER K, LDAB, N @@ -164,14 +165,17 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J, L - REAL SCALE, SUM, VALUE + REAL SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -313,46 +317,61 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N IF( K.GT.0 ) THEN DO 280 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL CLASSQ( MIN( J-1, K ), - $ AB( MAX( K+2-J, 1 ), J ), 1, SCALE, - $ SUM ) + $ AB( MAX( K+2-J, 1 ), J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 280 CONTINUE END IF ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 290 J = 1, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL CLASSQ( MIN( J, K+1 ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 290 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N IF( K.GT.0 ) THEN DO 300 J = 1, N - 1 - CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 300 CONTINUE END IF ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 310 J = 1, N - CALL CLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 310 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANTB = VALUE diff --git a/lapack-netlib/SRC/clantp.f b/lapack-netlib/SRC/clantp.f index 148ac5436..b0c48eb46 100644 --- a/lapack-netlib/SRC/clantp.f +++ b/lapack-netlib/SRC/clantp.f @@ -130,6 +130,7 @@ * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER N @@ -148,14 +149,17 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J, K - REAL SCALE, SUM, VALUE + REAL SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT @@ -308,45 +312,64 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N K = 2 DO 280 J = 2, N - CALL CLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( J-1, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + J 280 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE K = 1 DO 290 J = 1, N - CALL CLASSQ( J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( J, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + J 290 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N K = 2 DO 300 J = 1, N - 1 - CALL CLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( N-J, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 300 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE K = 1 DO 310 J = 1, N - CALL CLASSQ( N-J+1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( N-J+1, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 310 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANTP = VALUE diff --git a/lapack-netlib/SRC/clantr.f b/lapack-netlib/SRC/clantr.f index 4e1843d3d..3b361cc97 100644 --- a/lapack-netlib/SRC/clantr.f +++ b/lapack-netlib/SRC/clantr.f @@ -147,6 +147,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER LDA, M, N @@ -165,14 +166,17 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J - REAL SCALE, SUM, VALUE + REAL SUM, VALUE +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. * .. External Subroutines .. - EXTERNAL CLASSQ + EXTERNAL CLASSQ, SCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT @@ -283,7 +287,7 @@ END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - DO 210 I = 1, N + DO 210 I = 1, MIN( M, N ) WORK( I ) = ONE 210 CONTINUE DO 220 I = N + 1, M @@ -313,38 +317,56 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
* IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = MIN( M, N ) + SSQ( 1 ) = ONE + SSQ( 2 ) = MIN( M, N ) DO 290 J = 2, N - CALL CLASSQ( MIN( M, J-1 ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( MIN( M, J-1 ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 290 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 300 J = 1, N - CALL CLASSQ( MIN( M, J ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( MIN( M, J ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 300 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = MIN( M, N ) + SSQ( 1 ) = ONE + SSQ( 2 ) = MIN( M, N ) DO 310 J = 1, N - CALL CLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 310 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 320 J = 1, N - CALL CLASSQ( M-J+1, A( J, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL CLASSQ( M-J+1, A( J, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 320 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * CLANTR = VALUE diff --git a/lapack-netlib/SRC/claqps.f b/lapack-netlib/SRC/claqps.f index f47e852a0..d0b7efcd5 100644 --- a/lapack-netlib/SRC/claqps.f +++ b/lapack-netlib/SRC/claqps.f @@ -127,7 +127,7 @@ *> \param[in,out] AUXV *> \verbatim *> AUXV is COMPLEX array, dimension (NB) -*> Auxiliar vector. +*> Auxiliary vector. *> \endverbatim *> *> \param[in,out] F diff --git a/lapack-netlib/SRC/claqr0.f b/lapack-netlib/SRC/claqr0.f index b61c9f1e9..2f0ea20db 100644 --- a/lapack-netlib/SRC/claqr0.f +++ b/lapack-netlib/SRC/claqr0.f @@ -66,7 +66,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -78,12 +78,12 @@ *> \verbatim *> IHI is INTEGER *> It is assumed that H is already upper triangular in rows -*> and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1, +*> and columns 1:ILO-1 and IHI+1:N and, if ILO > 1, *> H(ILO,ILO-1) is zero. ILO and IHI are normally set by a *> previous call to CGEBAL, and then passed to CGEHRD when the *> matrix output by CGEBAL is reduced to Hessenberg form. *> Otherwise, ILO and IHI should be set to 1 and N, -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -95,17 +95,17 @@ *> contains the upper triangular matrix T from the Schur *> decomposition (the Schur form). If INFO = 0 and WANT is *> .FALSE., then the contents of H are unspecified on exit. -*> (The output value of H when INFO.GT.0 is given under the +*> (The output value of H when INFO > 0 is given under the *> description of INFO below.) *> -*> This subroutine may explicitly set H(i,j) = 0 for i.GT.j and +*> This subroutine may explicitly set H(i,j) = 0 for i > j and *> j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). 
*> \endverbatim *> *> \param[out] W @@ -127,7 +127,7 @@ *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. -*> 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N. +*> 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -137,7 +137,7 @@ *> If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is *> replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the *> orthogonal Schur factor of H(ILO:IHI,ILO:IHI). -*> (The output value of Z when INFO.GT.0 is given under +*> (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -145,7 +145,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if WANTZ is .TRUE. -*> then LDZ.GE.MAX(1,IHIZ). Otherwize, LDZ.GE.1. +*> then LDZ >= MAX(1,IHIZ). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -158,7 +158,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient, but LWORK typically as large as 6*N may *> be required for optimal performance. A workspace query *> to determine the optimal workspace size is recommended. @@ -174,19 +174,19 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, CLAQR0 failed to compute all of +*> = 0: successful exit +*> > 0: if INFO = i, CLAQR0 failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and WANT is .FALSE., then on exit, +*> If INFO > 0 and WANT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -194,7 +194,7 @@ *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> *> (final value of Z(ILO:IHI,ILOZ:IHIZ) *> = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U @@ -202,7 +202,7 @@ *> where U is the unitary matrix in (*) (regard- *> less of the value of WANTT.) *> -*> If INFO .GT. 0 and WANTZ is .FALSE., then Z is not +*> If INFO > 0 and WANTZ is .FALSE., then Z is not *> accessed. *> \endverbatim * @@ -639,7 +639,7 @@ END IF END IF * -* ==== Use up to NS of the the smallest magnatiude +* ==== Use up to NS of the the smallest magnitude * . shifts. If there aren't NS shifts available, * . then use them all, possibly dropping one to * . make the number of shifts even. ==== diff --git a/lapack-netlib/SRC/claqr1.f b/lapack-netlib/SRC/claqr1.f index 977947196..87d53871a 100644 --- a/lapack-netlib/SRC/claqr1.f +++ b/lapack-netlib/SRC/claqr1.f @@ -64,7 +64,7 @@ *> \verbatim *> LDH is INTEGER *> The leading dimension of H as declared in -*> the calling procedure. LDH.GE.N +*> the calling procedure. LDH >= N *> \endverbatim *> *> \param[in] S1 diff --git a/lapack-netlib/SRC/claqr2.f b/lapack-netlib/SRC/claqr2.f index 03e9760cf..fc282b2d6 100644 --- a/lapack-netlib/SRC/claqr2.f +++ b/lapack-netlib/SRC/claqr2.f @@ -102,7 +102,7 @@ *> \param[in] NW *> \verbatim *> NW is INTEGER -*> Deflation window size. 1 .LE. NW .LE. 
(KBOT-KTOP+1). +*> Deflation window size. 1 <= NW <= (KBOT-KTOP+1). *> \endverbatim *> *> \param[in,out] H @@ -120,7 +120,7 @@ *> \verbatim *> LDH is INTEGER *> Leading dimension of H just as declared in the calling -*> subroutine. N .LE. LDH +*> subroutine. N <= LDH *> \endverbatim *> *> \param[in] ILOZ @@ -132,7 +132,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -148,7 +148,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of Z just as declared in the -*> calling subroutine. 1 .LE. LDZ. +*> calling subroutine. 1 <= LDZ. *> \endverbatim *> *> \param[out] NS @@ -185,13 +185,13 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of V just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is INTEGER -*> The number of columns of T. NH.GE.NW. +*> The number of columns of T. NH >= NW. *> \endverbatim *> *> \param[out] T @@ -203,14 +203,14 @@ *> \verbatim *> LDT is INTEGER *> The leading dimension of T just as declared in the -*> calling subroutine. NW .LE. LDT +*> calling subroutine. NW <= LDT *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> The number of rows of work array WV available for -*> workspace. NV.GE.NW. +*> workspace. NV >= NW. *> \endverbatim *> *> \param[out] WV @@ -222,7 +222,7 @@ *> \verbatim *> LDWV is INTEGER *> The leading dimension of W just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/claqr3.f b/lapack-netlib/SRC/claqr3.f index 660a58376..84d57d4d6 100644 --- a/lapack-netlib/SRC/claqr3.f +++ b/lapack-netlib/SRC/claqr3.f @@ -99,7 +99,7 @@ *> \param[in] NW *> \verbatim *> NW is INTEGER -*> Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). +*> Deflation window size. 1 <= NW <= (KBOT-KTOP+1). *> \endverbatim *> *> \param[in,out] H @@ -117,7 +117,7 @@ *> \verbatim *> LDH is INTEGER *> Leading dimension of H just as declared in the calling -*> subroutine. N .LE. LDH +*> subroutine. N <= LDH *> \endverbatim *> *> \param[in] ILOZ @@ -129,7 +129,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -145,7 +145,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of Z just as declared in the -*> calling subroutine. 1 .LE. LDZ. +*> calling subroutine. 1 <= LDZ. *> \endverbatim *> *> \param[out] NS @@ -182,13 +182,13 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of V just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is INTEGER -*> The number of columns of T. NH.GE.NW. +*> The number of columns of T. NH >= NW. *> \endverbatim *> *> \param[out] T @@ -200,14 +200,14 @@ *> \verbatim *> LDT is INTEGER *> The leading dimension of T just as declared in the -*> calling subroutine. NW .LE. LDT +*> calling subroutine. NW <= LDT *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> The number of rows of work array WV available for -*> workspace. NV.GE.NW. +*> workspace. NV >= NW. 
*> \endverbatim *> *> \param[out] WV @@ -219,7 +219,7 @@ *> \verbatim *> LDWV is INTEGER *> The leading dimension of W just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/claqr4.f b/lapack-netlib/SRC/claqr4.f index 647fa6774..fba286df7 100644 --- a/lapack-netlib/SRC/claqr4.f +++ b/lapack-netlib/SRC/claqr4.f @@ -74,7 +74,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -86,12 +86,12 @@ *> \verbatim *> IHI is INTEGER *> It is assumed that H is already upper triangular in rows -*> and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1, +*> and columns 1:ILO-1 and IHI+1:N and, if ILO > 1, *> H(ILO,ILO-1) is zero. ILO and IHI are normally set by a *> previous call to CGEBAL, and then passed to CGEHRD when the *> matrix output by CGEBAL is reduced to Hessenberg form. *> Otherwise, ILO and IHI should be set to 1 and N, -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -103,17 +103,17 @@ *> contains the upper triangular matrix T from the Schur *> decomposition (the Schur form). If INFO = 0 and WANT is *> .FALSE., then the contents of H are unspecified on exit. -*> (The output value of H when INFO.GT.0 is given under the +*> (The output value of H when INFO > 0 is given under the *> description of INFO below.) *> -*> This subroutine may explicitly set H(i,j) = 0 for i.GT.j and +*> This subroutine may explicitly set H(i,j) = 0 for i > j and *> j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] W @@ -135,7 +135,7 @@ *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. -*> 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N. +*> 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -145,7 +145,7 @@ *> If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is *> replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the *> orthogonal Schur factor of H(ILO:IHI,ILO:IHI). -*> (The output value of Z when INFO.GT.0 is given under +*> (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -153,7 +153,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if WANTZ is .TRUE. -*> then LDZ.GE.MAX(1,IHIZ). Otherwize, LDZ.GE.1. +*> then LDZ >= MAX(1,IHIZ). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -166,7 +166,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient, but LWORK typically as large as 6*N may *> be required for optimal performance. A workspace query *> to determine the optimal workspace size is recommended. @@ -182,19 +182,19 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, CLAQR4 failed to compute all of +*> = 0: successful exit +*> > 0: if INFO = i, CLAQR4 failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. 
(Failures are rare.) *> -*> If INFO .GT. 0 and WANT is .FALSE., then on exit, +*> If INFO > 0 and WANT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -202,7 +202,7 @@ *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> *> (final value of Z(ILO:IHI,ILOZ:IHIZ) *> = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U @@ -210,7 +210,7 @@ *> where U is the unitary matrix in (*) (regard- *> less of the value of WANTT.) *> -*> If INFO .GT. 0 and WANTZ is .FALSE., then Z is not +*> If INFO > 0 and WANTZ is .FALSE., then Z is not *> accessed. *> \endverbatim * @@ -643,7 +643,7 @@ END IF END IF * -* ==== Use up to NS of the the smallest magnatiude +* ==== Use up to NS of the the smallest magnitude * . shifts. If there aren't NS shifts available, * . then use them all, possibly dropping one to * . make the number of shifts even. ==== diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index 4c897895d..e4317a3ad 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -125,7 +125,7 @@ *> \verbatim *> LDH is INTEGER *> LDH is the leading dimension of H just as declared in the -*> calling procedure. LDH.GE.MAX(1,N). +*> calling procedure. LDH >= MAX(1,N). *> \endverbatim *> *> \param[in] ILOZ @@ -137,7 +137,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N *> \endverbatim *> *> \param[in,out] Z @@ -153,7 +153,7 @@ *> \verbatim *> LDZ is INTEGER *> LDA is the leading dimension of Z just as declared in -*> the calling procedure. LDZ.GE.N. +*> the calling procedure. LDZ >= N. *> \endverbatim *> *> \param[out] V @@ -165,7 +165,7 @@ *> \verbatim *> LDV is INTEGER *> LDV is the leading dimension of V as declared in the -*> calling procedure. LDV.GE.3. +*> calling procedure. LDV >= 3. *> \endverbatim *> *> \param[out] U @@ -177,33 +177,14 @@ *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU.GE.3*NSHFTS-3. -*> \endverbatim -*> -*> \param[in] NH -*> \verbatim -*> NH is INTEGER -*> NH is the number of columns in array WH available for -*> workspace. NH.GE.1. -*> \endverbatim -*> -*> \param[out] WH -*> \verbatim -*> WH is COMPLEX array, dimension (LDWH,NH) -*> \endverbatim -*> -*> \param[in] LDWH -*> \verbatim -*> LDWH is INTEGER -*> Leading dimension of WH just as declared in the -*> calling procedure. LDWH.GE.3*NSHFTS-3. +*> in the calling subroutine. LDU >= 3*NSHFTS-3. *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> NV is the number of rows in WV agailable for workspace. -*> NV.GE.1. +*> NV >= 1. *> \endverbatim *> *> \param[out] WV @@ -215,9 +196,28 @@ *> \verbatim *> LDWV is INTEGER *> LDWV is the leading dimension of WV as declared in the -*> in the calling subroutine. LDWV.GE.NV. +*> in the calling subroutine. LDWV >= NV. *> \endverbatim * +*> \param[in] NH +*> \verbatim +*> NH is INTEGER +*> NH is the number of columns in array WH available for +*> workspace. NH >= 1. 
+*> \endverbatim
 *>
+*> \param[out] WH
 *> \verbatim
+*> WH is COMPLEX array, dimension (LDWH,NH)
+*> \endverbatim
+*>
+*> \param[in] LDWH
+*> \verbatim
+*> LDWH is INTEGER
+*> Leading dimension of WH just as declared in the
+*> calling procedure. LDWH >= 3*NSHFTS-3.
+*> \endverbatim
+*>
 * Authors:
 * ========
 *
diff --git a/lapack-netlib/SRC/clarfb.f b/lapack-netlib/SRC/clarfb.f
index 8fdd5c89c..a4d429c09 100644
--- a/lapack-netlib/SRC/clarfb.f
+++ b/lapack-netlib/SRC/clarfb.f
@@ -92,6 +92,8 @@
 *> K is INTEGER
 *> The order of the matrix T (= the number of elementary
 *> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
 *> \endverbatim
 *>
 *> \param[in] V
diff --git a/lapack-netlib/SRC/clarfx.f b/lapack-netlib/SRC/clarfx.f
index 1111c80f7..ad284883d 100644
--- a/lapack-netlib/SRC/clarfx.f
+++ b/lapack-netlib/SRC/clarfx.f
@@ -94,7 +94,7 @@
 *> \param[in] LDC
 *> \verbatim
 *> LDC is INTEGER
-*> The leading dimension of the array C. LDA >= max(1,M).
+*> The leading dimension of the array C. LDC >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/clarfy.f b/lapack-netlib/SRC/clarfy.f
index a5743858c..fccd136a8 100644
--- a/lapack-netlib/SRC/clarfy.f
+++ b/lapack-netlib/SRC/clarfy.f
@@ -103,7 +103,7 @@
 *
 *> \date December 2016
 *
-*> \ingroup complex_eig
+*> \ingroup complexOTHERauxiliary
 *
 * =====================================================================
       SUBROUTINE CLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f
index 72fe1f948..a45f55ac3 100644
--- a/lapack-netlib/SRC/clarrv.f
+++ b/lapack-netlib/SRC/clarrv.f
@@ -143,7 +143,7 @@
 *> RTOL2 is REAL
 *> Parameters for bisection.
 *> An interval [LEFT,RIGHT] has converged if
-*> RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*> RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *> \endverbatim
 *>
 *> \param[in,out] W
diff --git a/lapack-netlib/SRC/classq.f b/lapack-netlib/SRC/classq.f
index 28398596f..92e407ff3 100644
--- a/lapack-netlib/SRC/classq.f
+++ b/lapack-netlib/SRC/classq.f
@@ -41,7 +41,7 @@
 *> where x( i ) = abs( X( 1 + ( i - 1 )*INCX ) ). The value of sumsq is
 *> assumed to be at least unity and the value of ssq will then satisfy
 *>
-*> 1.0 .le. ssq .le. ( sumsq + 2*n ).
+*> 1.0 <= ssq <= ( sumsq + 2*n ).
 *>
 *> scale is assumed to be non-negative and scl returns the value
 *>
@@ -65,7 +65,7 @@
 *>
 *> \param[in] X
 *> \verbatim
-*> X is COMPLEX array, dimension (N)
+*> X is COMPLEX array, dimension (1+(N-1)*INCX)
 *> The vector x as described above.
 *> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f
index 5fa2276e8..dcbdc0d52 100644
--- a/lapack-netlib/SRC/claswlq.f
+++ b/lapack-netlib/SRC/claswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b CLASWLQ
 *
 * Definition:
 * ===========
@@ -18,9 +19,20 @@
 *>
 *> \verbatim
 *>
-*> CLASWLQ computes a blocked Short-Wide LQ factorization of a
-*> M-by-N matrix A, where N >= M:
-*> A = L * Q
+*> CLASWLQ computes a blocked Tall-Skinny LQ factorization of
+*> a complex M-by-N matrix A for M <= N:
+*>
+*> A = ( L 0 ) * Q,
+*>
+*> where:
+*>
+*> Q is an N-by-N orthogonal matrix, stored on exit in an implicit
+*> form in the elements above the diagonal of the array A and in
+*> the elements of the array T;
+*> L is a lower-triangular M-by-M matrix stored on exit in
+*> the elements on and below the diagonal of the array A.
+*> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored. +*> *> \endverbatim * * Arguments: @@ -150,7 +162,7 @@ SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, $ INFO) * -* -- LAPACK computational routine (version 3.7.1) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * June 2017 diff --git a/lapack-netlib/SRC/clasyf_aa.f b/lapack-netlib/SRC/clasyf_aa.f index 1bc96ee1b..a44a8f5b1 100644 --- a/lapack-netlib/SRC/clasyf_aa.f +++ b/lapack-netlib/SRC/clasyf_aa.f @@ -84,7 +84,7 @@ *> *> \param[in,out] A *> \verbatim -*> A is REAL array, dimension (LDA,M) for +*> A is COMPLEX array, dimension (LDA,M) for *> the first panel, while dimension (LDA,M+1) for the *> remaining panels. *> @@ -112,7 +112,7 @@ *> *> \param[in,out] H *> \verbatim -*> H is REAL workspace, dimension (LDH,NB). +*> H is COMPLEX workspace, dimension (LDH,NB). *> *> \endverbatim *> @@ -124,7 +124,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace, dimension (M). +*> WORK is COMPLEX workspace, dimension (M). *> \endverbatim *> * @@ -284,8 +284,9 @@ * * Swap A(I1, I2+1:M) with A(I2, I2+1:M) * - CALL CSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, - $ A( J1+I2-1, I2+1 ), LDA ) + IF( I2.LT.M ) + $ CALL CSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, + $ A( J1+I2-1, I2+1 ), LDA ) * * Swap A(I1, I1) with A(I2,I2) * @@ -325,13 +326,15 @@ * Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1) * - IF( A( K, J+1 ).NE.ZERO ) THEN - ALPHA = ONE / A( K, J+1 ) - CALL CCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) - CALL CSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) - ELSE - CALL CLASET( 'Full', 1, M-J-1, ZERO, ZERO, - $ A( K, J+2 ), LDA) + IF( J.LT.(M-1) ) THEN + IF( A( K, J+1 ).NE.ZERO ) THEN + ALPHA = ONE / A( K, J+1 ) + CALL CCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) + CALL CSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) + ELSE + CALL CLASET( 'Full', 1, M-J-1, ZERO, ZERO, + $ A( K, J+2 ), LDA) + END IF END IF END IF J = J + 1 @@ -432,8 +435,9 @@ * * Swap A(I2+1:M, I1) with A(I2+1:M, I2) * - CALL CSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, - $ A( I2+1, J1+I2-1 ), 1 ) + IF( I2.LT.M ) + $ CALL CSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, + $ A( I2+1, J1+I2-1 ), 1 ) * * Swap A(I1, I1) with A(I2, I2) * @@ -473,13 +477,15 @@ * Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1) * - IF( A( J+1, K ).NE.ZERO ) THEN - ALPHA = ONE / A( J+1, K ) - CALL CCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) - CALL CSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) - ELSE - CALL CLASET( 'Full', M-J-1, 1, ZERO, ZERO, - $ A( J+2, K ), LDA ) + IF( J.LT.(M-1) ) THEN + IF( A( J+1, K ).NE.ZERO ) THEN + ALPHA = ONE / A( J+1, K ) + CALL CCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) + CALL CSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) + ELSE + CALL CLASET( 'Full', M-J-1, 1, ZERO, ZERO, + $ A( J+2, K ), LDA ) + END IF END IF END IF J = J + 1 diff --git a/lapack-netlib/SRC/clasyf_rk.f b/lapack-netlib/SRC/clasyf_rk.f index 0700c5cc2..bd7a0fb45 100644 --- a/lapack-netlib/SRC/clasyf_rk.f +++ b/lapack-netlib/SRC/clasyf_rk.f @@ -330,7 +330,7 @@ * of A and working backwards, and compute the matrix W = U12*D * for use in updating A11 * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = CZERO @@ -658,7 +658,7 @@ * of A and working forwards, 
and compute the matrix W = L21*D * for use in updating A22 * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = CZERO * diff --git a/lapack-netlib/SRC/clatdf.f b/lapack-netlib/SRC/clatdf.f index 357f66422..557830d1c 100644 --- a/lapack-netlib/SRC/clatdf.f +++ b/lapack-netlib/SRC/clatdf.f @@ -261,7 +261,7 @@ * * Solve for U- part, lockahead for RHS(N) = +-1. This is not done * In BSOLVE and will hopefully give us a better estimate because -* any ill-conditioning of the original matrix is transfered to U +* any ill-conditioning of the original matrix is transferred to U * and not to L. U(N, N) is an approximation to sigma_min(LU). * CALL CCOPY( N-1, RHS, 1, WORK, 1 ) diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index dab5774c1..e9c6d77c2 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -1,3 +1,4 @@ +*> \brief \b CLATSQR * * Definition: * =========== @@ -18,9 +19,23 @@ *> *> \verbatim *> -*> SLATSQR computes a blocked Tall-Skinny QR factorization of -*> an M-by-N matrix A, where M >= N: -*> A = Q * R . +*> CLATSQR computes a blocked Tall-Skinny QR factorization of +*> a complex M-by-N matrix A for M >= N: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix, stored on exit in an implicit +*> form in the elements below the digonal of the array A and in +*> the elemenst of the array T; +*> +*> R is an upper-triangular N-by-N matrix, stored on exit in +*> the elements on and above the diagonal of the array A. +*> +*> 0 is a (M-N)-by-N zero matrix, and is not stored. +*> *> \endverbatim * * Arguments: @@ -149,10 +164,10 @@ SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, $ LWORK, INFO) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK diff --git a/lapack-netlib/SRC/claunhr_col_getrfnp.f b/lapack-netlib/SRC/claunhr_col_getrfnp.f new file mode 100644 index 000000000..66b9c0407 --- /dev/null +++ b/lapack-netlib/SRC/claunhr_col_getrfnp.f @@ -0,0 +1,248 @@ +*> \brief \b CLAUNHR_COL_GETRFNP +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAUNHR_COL_GETRFNP + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAUNHR_COL_GETRFNP( M, N, A, LDA, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), D( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAUNHR_COL_GETRFNP computes the modified LU factorization without +*> pivoting of a complex general M-by-N matrix A. The factorization has +*> the form: +*> +*> A - S = L * U, +*> +*> where: +*> S is a m-by-n diagonal sign matrix with the diagonal D, so that +*> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed +*> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing +*> i-1 steps of Gaussian elimination. 
This means that the diagonal +*> element at each step of "modified" Gaussian elimination is +*> at least one in absolute value (so that division-by-zero not +*> not possible during the division by the diagonal element); +*> +*> L is a M-by-N lower triangular matrix with unit diagonal elements +*> (lower trapezoidal if M > N); +*> +*> and U is a M-by-N upper triangular matrix +*> (upper trapezoidal if M < N). +*> +*> This routine is an auxiliary routine used in the Householder +*> reconstruction routine CUNHR_COL. In CUNHR_COL, this routine is +*> applied to an M-by-N matrix A with orthonormal columns, where each +*> element is bounded by one in absolute value. With the choice of +*> the matrix S above, one can show that the diagonal element at each +*> step of Gaussian elimination is the largest (in absolute value) in +*> the column on or below the diagonal, so that no pivoting is required +*> for numerical stability [1]. +*> +*> For more details on the Householder reconstruction algorithm, +*> including the modified LU factorization, see [1]. +*> +*> This is the blocked right-looking version of the algorithm, +*> calling Level 3 BLAS to update the submatrix. To factorize a block, +*> this routine calls the recursive routine CLAUNHR_COL_GETRFNP2. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the M-by-N matrix to be factored. +*> On exit, the factors L and U from the factorization +*> A-S=L*U; the unit diagonal elements of L are not stored. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is COMPLEX array, dimension min(M,N) +*> The diagonal elements of the diagonal M-by-N sign matrix S, +*> D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can be +*> only ( +1.0, 0.0 ) or (-1.0, 0.0 ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complexGEcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CLAUNHR_COL_GETRFNP( M, N, A, LDA, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), D( * ) +* .. 
+* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CLAUNHR_COL_GETRFNP2, CTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLAUNHR_COL_GETRFNP', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + + NB = ILAENV( 1, 'CLAUNHR_COL_GETRFNP', ' ', M, N, -1, -1 ) + + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL CLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) + ELSE +* +* Use blocked code. +* + DO J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks. +* + CALL CLAUNHR_COL_GETRFNP2( M-J+1, JB, A( J, J ), LDA, + $ D( J ), IINFO ) +* + IF( J+JB.LE.N ) THEN +* +* Compute block row of U. +* + CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, CONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL CGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -CONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, CONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + END DO + END IF + RETURN +* +* End of CLAUNHR_COL_GETRFNP +* + END diff --git a/lapack-netlib/SRC/claunhr_col_getrfnp2.f b/lapack-netlib/SRC/claunhr_col_getrfnp2.f new file mode 100644 index 000000000..82fc329ee --- /dev/null +++ b/lapack-netlib/SRC/claunhr_col_getrfnp2.f @@ -0,0 +1,314 @@ +*> \brief \b CLAUNHR_COL_GETRFNP2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAUNHR_COL_GETRFNP2 + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE CLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), D( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAUNHR_COL_GETRFNP2 computes the modified LU factorization without +*> pivoting of a complex general M-by-N matrix A. The factorization has +*> the form: +*> +*> A - S = L * U, +*> +*> where: +*> S is a m-by-n diagonal sign matrix with the diagonal D, so that +*> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed +*> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing +*> i-1 steps of Gaussian elimination. This means that the diagonal +*> element at each step of "modified" Gaussian elimination is at +*> least one in absolute value (so that division-by-zero not +*> possible during the division by the diagonal element); +*> +*> L is a M-by-N lower triangular matrix with unit diagonal elements +*> (lower trapezoidal if M > N); +*> +*> and U is a M-by-N upper triangular matrix +*> (upper trapezoidal if M < N). +*> +*> This routine is an auxiliary routine used in the Householder +*> reconstruction routine CUNHR_COL. 
In CUNHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the recursive version of the LU factorization algorithm.
+*> Denote A - S by B. The algorithm divides the matrix B into four
+*> submatrices:
+*>
+*>        [  B11 | B12  ]  where B11 is n1 by n1,
+*>    B = [ -----|----- ]        B21 is (m-n1) by n1,
+*>        [  B21 | B22  ]        B12 is n1 by n2,
+*>                               B22 is (m-n1) by n2,
+*>                               with n1 = min(m,n)/2, n2 = n-n1.
+*>
+*>
+*> The subroutine calls itself to factor B11, solves for B21,
+*> solves for B12, updates B22, then calls itself to factor B22.
+*>
+*> For more details on the recursive LU algorithm, see [2].
+*>
+*> CLAUNHR_COL_GETRFNP2 is called to factorize a block by the blocked
+*> routine CLAUNHR_COL_GETRFNP, which uses blocked code calling
+*> Level 3 BLAS to update the submatrix. However, CLAUNHR_COL_GETRFNP2
+*> is self-sufficient and can be used without CLAUNHR_COL_GETRFNP.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*>
+*> [2] "Recursion leads to automatic variable blocking for dense linear
+*>     algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+*>     vol. 41, no. 6, pp. 737-755, 1997.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is COMPLEX array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can be
+*>          only ( +1.0, 0.0 ) or (-1.0, 0.0 ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+* +*> \date November 2019 +* +*> \ingroup complexGEcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + RECURSIVE SUBROUTINE CLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), D( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + REAL SFMIN + INTEGER I, IINFO, N1, N2 + COMPLEX Z +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CSCAL, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, CMPLX, AIMAG, SIGN, MAX, MIN +* .. +* .. Statement Functions .. + DOUBLE PRECISION CABS1 +* .. +* .. Statement Function definitions .. + CABS1( Z ) = ABS( REAL( Z ) ) + ABS( AIMAG( Z ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLAUNHR_COL_GETRFNP2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) + $ RETURN + + IF ( M.EQ.1 ) THEN +* +* One row case, (also recursion termination case), +* use unblocked code +* +* Transfer the sign +* + D( 1 ) = CMPLX( -SIGN( ONE, REAL( A( 1, 1 ) ) ) ) +* +* Construct the row of U +* + A( 1, 1 ) = A( 1, 1 ) - D( 1 ) +* + ELSE IF( N.EQ.1 ) THEN +* +* One column case, (also recursion termination case), +* use unblocked code +* +* Transfer the sign +* + D( 1 ) = CMPLX( -SIGN( ONE, REAL( A( 1, 1 ) ) ) ) +* +* Construct the row of U +* + A( 1, 1 ) = A( 1, 1 ) - D( 1 ) +* +* Scale the elements 2:M of the column +* +* Determine machine safe minimum +* + SFMIN = SLAMCH('S') +* +* Construct the subdiagonal elements of L +* + IF( CABS1( A( 1, 1 ) ) .GE. SFMIN ) THEN + CALL CSCAL( M-1, CONE / A( 1, 1 ), A( 2, 1 ), 1 ) + ELSE + DO I = 2, M + A( I, 1 ) = A( I, 1 ) / A( 1, 1 ) + END DO + END IF +* + ELSE +* +* Divide the matrix B into four submatrices +* + N1 = MIN( M, N ) / 2 + N2 = N-N1 + +* +* Factor B11, recursive call +* + CALL CLAUNHR_COL_GETRFNP2( N1, N1, A, LDA, D, IINFO ) +* +* Solve for B21 +* + CALL CTRSM( 'R', 'U', 'N', 'N', M-N1, N1, CONE, A, LDA, + $ A( N1+1, 1 ), LDA ) +* +* Solve for B12 +* + CALL CTRSM( 'L', 'L', 'N', 'U', N1, N2, CONE, A, LDA, + $ A( 1, N1+1 ), LDA ) +* +* Update B22, i.e. 
compute the Schur complement +* B22 := B22 - B21*B12 +* + CALL CGEMM( 'N', 'N', M-N1, N2, N1, -CONE, A( N1+1, 1 ), LDA, + $ A( 1, N1+1 ), LDA, CONE, A( N1+1, N1+1 ), LDA ) +* +* Factor B22, recursive call +* + CALL CLAUNHR_COL_GETRFNP2( M-N1, N2, A( N1+1, N1+1 ), LDA, + $ D( N1+1 ), IINFO ) +* + END IF + RETURN +* +* End of CLAUNHR_COL_GETRFNP2 +* + END diff --git a/lapack-netlib/SRC/cporfsx.f b/lapack-netlib/SRC/cporfsx.f index 872bad36c..3a2db7135 100644 --- a/lapack-netlib/SRC/cporfsx.f +++ b/lapack-netlib/SRC/cporfsx.f @@ -44,7 +44,7 @@ *> \verbatim *> *> CPORFSX improves the computed solution to a system of linear -*> equations when the coefficient matrix is symmetric positive +*> equations when the coefficient matrix is Hermitian positive *> definite, and provides error bounds and backward error estimates *> for the solution. In addition to normwise error bound, the code *> provides maximum componentwise error bound if possible. See @@ -103,7 +103,7 @@ *> \param[in] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> The symmetric matrix A. If UPLO = 'U', the leading N-by-N +*> The Hermitian matrix A. If UPLO = 'U', the leading N-by-N *> upper triangular part of A contains the upper triangular part *> of the matrix A, and the strictly lower triangular part of A *> is not referenced. If UPLO = 'L', the leading N-by-N lower @@ -134,7 +134,7 @@ *> \param[in,out] S *> \verbatim *> S is REAL array, dimension (N) -*> The row scale factors for A. If EQUED = 'Y', A is multiplied on +*> The scale factors for A. If EQUED = 'Y', A is multiplied on *> the left and right by diag(S). S is an input argument if FACT = *> 'F'; otherwise, S is an output argument. If FACT = 'F' and EQUED *> = 'Y', each element of S must be positive. If S is output, each @@ -262,7 +262,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -298,14 +298,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -313,9 +313,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. 
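The PARAMS conventions documented above are shared across the
xPORFSX/xPOSVXX driver family. As a minimal sketch (values are
illustrative only; the index names and the defaults quoted are the ones
given in the parameter descriptions above), a caller would populate the
array like this before invoking CPORFSX:

      INTEGER            NPARAMS
      REAL               PARAMS( 3 )
*     LA_LINRX_ITREF_I = 1: perform iterative refinement.
      PARAMS( 1 ) = 1.0E+0
*     LA_LINRX_ITHRESH_I = 2: allow at most 10 refinement steps.
      PARAMS( 2 ) = 10.0E+0
*     LA_LINRX_CWISE_I = 3: request componentwise error bounds.
      PARAMS( 3 ) = 1.0E+0
      NPARAMS = 3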
diff --git a/lapack-netlib/SRC/cposvxx.f b/lapack-netlib/SRC/cposvxx.f index 64d1b67fa..57c2d3feb 100644 --- a/lapack-netlib/SRC/cposvxx.f +++ b/lapack-netlib/SRC/cposvxx.f @@ -45,7 +45,7 @@ *> *> CPOSVXX uses the Cholesky factorization A = U**T*U or A = L*L**T *> to compute the solution to a complex system of linear equations -*> A * X = B, where A is an N-by-N symmetric positive definite matrix +*> A * X = B, where A is an N-by-N Hermitian positive definite matrix *> and X and B are N-by-NRHS matrices. *> *> If requested, both normwise and maximum componentwise error bounds @@ -157,7 +157,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> On entry, the symmetric matrix A, except if FACT = 'F' and EQUED = +*> On entry, the Hermitian matrix A, except if FACT = 'F' and EQUED = *> 'Y', then A must contain the equilibrated matrix *> diag(S)*A*diag(S). If UPLO = 'U', the leading N-by-N upper *> triangular part of A contains the upper triangular part of the @@ -365,7 +365,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -401,14 +401,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -416,9 +416,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/cpotrf2.f b/lapack-netlib/SRC/cpotrf2.f index 789843c41..ed4f12cba 100644 --- a/lapack-netlib/SRC/cpotrf2.f +++ b/lapack-netlib/SRC/cpotrf2.f @@ -24,7 +24,7 @@ *> *> \verbatim *> -*> CPOTRF2 computes the Cholesky factorization of a real symmetric +*> CPOTRF2 computes the Cholesky factorization of a Hermitian *> positive definite matrix A using the recursive algorithm. *> *> The factorization has the form @@ -63,7 +63,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> On entry, the symmetric matrix A. If UPLO = 'U', the leading +*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading *> N-by-N upper triangular part of A contains the upper *> triangular part of the matrix A, and the strictly lower *> triangular part of A is not referenced. 
If UPLO = 'L', the diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f index 22ac842c9..8fb8131d8 100644 --- a/lapack-netlib/SRC/cstemr.f +++ b/lapack-netlib/SRC/cstemr.f @@ -250,13 +250,13 @@ *> \param[in,out] TRYRAC *> \verbatim *> TRYRAC is LOGICAL -*> If TRYRAC.EQ..TRUE., indicates that the code should check whether +*> If TRYRAC = .TRUE., indicates that the code should check whether *> the tridiagonal matrix defines its eigenvalues to high relative *> accuracy. If so, the code uses relative-accuracy preserving *> algorithms that might be (a bit) slower depending on the matrix. *> If the matrix does not define its eigenvalues to high relative *> accuracy, the code can uses possibly faster algorithms. -*> If TRYRAC.EQ..FALSE., the code is not required to guarantee +*> If TRYRAC = .FALSE., the code is not required to guarantee *> relatively accurate eigenvalues and can use the fastest possible *> techniques. *> On exit, a .TRUE. TRYRAC will be set to .FALSE. if the matrix diff --git a/lapack-netlib/SRC/csycon_3.f b/lapack-netlib/SRC/csycon_3.f index 47d52dd15..5c1cb0491 100644 --- a/lapack-netlib/SRC/csycon_3.f +++ b/lapack-netlib/SRC/csycon_3.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE CSYCON_3( UPLO, N, A, LDA, E, IPIV, ANORM, RCOND, -* WORK, IWORK, INFO ) +* WORK, INFO ) * * .. Scalar Arguments .. * CHARACTER UPLO @@ -27,7 +27,7 @@ * REAL ANORM, RCOND * .. * .. Array Arguments .. -* INTEGER IPIV( * ), IWORK( * ) +* INTEGER IPIV( * ) * COMPLEX A( LDA, * ), E ( * ), WORK( * ) * .. * @@ -129,11 +129,6 @@ *> WORK is COMPLEX array, dimension (2*N) *> \endverbatim *> -*> \param[out] IWORK -*> \verbatim -*> IWORK is INTEGER array, dimension (N) -*> \endverbatim -*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER diff --git a/lapack-netlib/SRC/csyconvf.f b/lapack-netlib/SRC/csyconvf.f index 77ecf46b5..fd5a5e47f 100644 --- a/lapack-netlib/SRC/csyconvf.f +++ b/lapack-netlib/SRC/csyconvf.f @@ -294,7 +294,7 @@ * * Convert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in factorization order where i decreases from N to 1 * I = N @@ -347,7 +347,7 @@ * * Revert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in reverse factorization order where i increases from 1 to N * I = 1 @@ -438,7 +438,7 @@ * * Convert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in factorization order where k increases from 1 to N * I = 1 @@ -491,7 +491,7 @@ * * Revert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in reverse factorization order where i decreases from N to 1 * I = N diff --git a/lapack-netlib/SRC/csyconvf_rook.f b/lapack-netlib/SRC/csyconvf_rook.f index 1146a97c5..7ede26863 100644 --- a/lapack-netlib/SRC/csyconvf_rook.f +++ b/lapack-netlib/SRC/csyconvf_rook.f @@ -285,7 +285,7 @@ * * Convert PERMUTATIONS * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in factorization order where i decreases from N to 1 * I = N @@ -336,7 +336,7 @@ * * Revert PERMUTATIONS * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in reverse factorization order where i increases from 1 to N * I = 1 @@ -426,7 +426,7 @@ * * Convert 
PERMUTATIONS * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in factorization order where i increases from 1 to N * I = 1 @@ -477,7 +477,7 @@ * * Revert PERMUTATIONS * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in reverse factorization order where i decreases from N to 1 * I = N diff --git a/lapack-netlib/SRC/csyrfsx.f b/lapack-netlib/SRC/csyrfsx.f index 7323ba8eb..4d1bc3ccc 100644 --- a/lapack-netlib/SRC/csyrfsx.f +++ b/lapack-netlib/SRC/csyrfsx.f @@ -271,7 +271,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -307,14 +307,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -322,9 +322,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f index 87be734cc..2081644b1 100644 --- a/lapack-netlib/SRC/csysv_aa.f +++ b/lapack-netlib/SRC/csysv_aa.f @@ -42,7 +42,7 @@ *> matrices. *> *> Aasen's algorithm is used to factor A as -*> A = U * T * U**T, if UPLO = 'U', or +*> A = U**T * T * U, if UPLO = 'U', or *> A = L * T * L**T, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is symmetric tridiagonal. The factored @@ -75,7 +75,7 @@ *> *> \param[in,out] A *> \verbatim -*> A is REAL array, dimension (LDA,N) +*> A is COMPLEX array, dimension (LDA,N) *> On entry, the symmetric matrix A. If UPLO = 'U', the leading *> N-by-N upper triangular part of A contains the upper *> triangular part of the matrix A, and the strictly lower @@ -86,7 +86,7 @@ *> *> On exit, if INFO = 0, the tridiagonal matrix T and the *> multipliers used to obtain the factor U or L from the -*> factorization A = U*T*U**T or A = L*T*L**T as computed by +*> factorization A = U**T*T*U or A = L*T*L**T as computed by *> CSYTRF. *> \endverbatim *> @@ -106,7 +106,7 @@ *> *> \param[in,out] B *> \verbatim -*> B is REAL array, dimension (LDB,NRHS) +*> B is COMPLEX array, dimension (LDB,NRHS) *> On entry, the N-by-NRHS right hand side matrix B. 
*> On exit, if INFO = 0, the N-by-NRHS solution matrix X. *> \endverbatim @@ -119,7 +119,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -230,7 +230,7 @@ RETURN END IF * -* Compute the factorization A = U*T*U**T or A = L*T*L**T. +* Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL CSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) IF( INFO.EQ.0 ) THEN diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f index a13349824..c5c328c63 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.f +++ b/lapack-netlib/SRC/csysv_aa_2stage.f @@ -43,8 +43,8 @@ *> matrices. *> *> Aasen's 2-stage algorithm is used to factor A as -*> A = U * T * U**H, if UPLO = 'U', or -*> A = L * T * L**H, if UPLO = 'L', +*> A = U**T * T * U, if UPLO = 'U', or +*> A = L * T * L**T, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is symmetric and band. The matrix T is *> then LU-factored with partial pivoting. The factored form of A @@ -257,7 +257,7 @@ END IF * * -* Compute the factorization A = U*T*U**H or A = L*T*L**H. +* Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, $ WORK, LWORK, INFO ) diff --git a/lapack-netlib/SRC/csysvxx.f b/lapack-netlib/SRC/csysvxx.f index 2fd2c8771..7a9aee105 100644 --- a/lapack-netlib/SRC/csysvxx.f +++ b/lapack-netlib/SRC/csysvxx.f @@ -378,7 +378,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -414,14 +414,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -429,9 +429,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. 
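The Aasen-based solvers updated above follow the usual LAPACK
workspace-query convention. A minimal calling sketch for CSYSV_AA
(declarations abridged, data setup and error checks omitted; the
argument list is the one documented in csysv_aa.f):

      INTEGER            N, NRHS, LDA, LDB, LWORK, INFO
      PARAMETER          ( N = 100, NRHS = 1, LDA = N, LDB = N )
      INTEGER            IPIV( N )
      COMPLEX            A( LDA, N ), B( LDB, NRHS ), WKOPT
      COMPLEX, ALLOCATABLE :: WORK( : )
*     ... fill A (complex symmetric) and the right-hand sides B ...
*     Workspace query: LWORK = -1 returns the optimal size in WKOPT.
      CALL CSYSV_AA( 'U', N, NRHS, A, LDA, IPIV, B, LDB, WKOPT, -1,
     $               INFO )
      LWORK = INT( WKOPT )
      ALLOCATE( WORK( LWORK ) )
*     Factor A = U**T*T*U and overwrite B with the solution X.
      CALL CSYSV_AA( 'U', N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWORK,
     $               INFO )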
diff --git a/lapack-netlib/SRC/csytf2_rk.f b/lapack-netlib/SRC/csytf2_rk.f index 3b5e53a03..7e39c2dfd 100644 --- a/lapack-netlib/SRC/csytf2_rk.f +++ b/lapack-netlib/SRC/csytf2_rk.f @@ -321,7 +321,7 @@ * * Factorize A as U*D*U**T using the upper triangle of A * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = CZERO @@ -632,7 +632,7 @@ * * Factorize A as L*D*L**T using the lower triangle of A * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = CZERO * diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f index c389725e9..af913b8f4 100644 --- a/lapack-netlib/SRC/csytrf.f +++ b/lapack-netlib/SRC/csytrf.f @@ -43,7 +43,7 @@ *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and D is symmetric and block diagonal with -*> with 1-by-1 and 2-by-2 diagonal blocks. +*> 1-by-1 and 2-by-2 diagonal blocks. *> *> This is the blocked version of the algorithm, calling Level 3 BLAS. *> \endverbatim diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f index 2f185b0c7..427235bda 100644 --- a/lapack-netlib/SRC/csytrf_aa.f +++ b/lapack-netlib/SRC/csytrf_aa.f @@ -37,7 +37,7 @@ *> CSYTRF_AA computes the factorization of a complex symmetric matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a complex symmetric tridiagonal matrix. @@ -63,7 +63,7 @@ *> *> \param[in,out] A *> \verbatim -*> A is REAL array, dimension (LDA,N) +*> A is COMPLEX array, dimension (LDA,N) *> On entry, the symmetric matrix A. If UPLO = 'U', the leading *> N-by-N upper triangular part of A contains the upper *> triangular part of the matrix A, and the strictly lower @@ -94,7 +94,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -223,7 +223,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... * * Copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N)) @@ -256,7 +256,7 @@ $ A( MAX(1, J), J+1 ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J @@ -375,7 +375,7 @@ $ A( J+1, MAX(1, J) ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f index 0d0bd156c..0946d61b0 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.f +++ b/lapack-netlib/SRC/csytrf_aa_2stage.f @@ -38,7 +38,7 @@ *> CSYTRF_AA_2STAGE computes the factorization of a complex symmetric matrix A *> using the Aasen's algorithm. 
The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a complex symmetric band matrix with the @@ -275,7 +275,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... * DO J = 0, NT-1 @@ -448,12 +448,14 @@ c END IF * > Apply pivots to previous columns of L CALL CSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, $ A( (J+1)*NB+1, I2 ), 1 ) -* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL CSWAP( I2-I1-1, A( I1, I1+1 ), LDA, - $ A( I1+1, I2 ), 1 ) +* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) + IF( I2.GT.(I1+1) ) + $ CALL CSWAP( I2-I1-1, A( I1, I1+1 ), LDA, + $ A( I1+1, I2 ), 1 ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL CSWAP( N-I2, A( I1, I2+1 ), LDA, - $ A( I2, I2+1 ), LDA ) + IF( I2.LT.N ) + $ CALL CSWAP( N-I2, A( I1, I2+1 ), LDA, + $ A( I2, I2+1 ), LDA ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) @@ -637,11 +639,13 @@ c END IF CALL CSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, $ A( I2, (J+1)*NB+1 ), LDA ) * > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL CSWAP( I2-I1-1, A( I1+1, I1 ), 1, - $ A( I2, I1+1 ), LDA ) + IF( I2.GT.(I1+1) ) + $ CALL CSWAP( I2-I1-1, A( I1+1, I1 ), 1, + $ A( I2, I1+1 ), LDA ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL CSWAP( N-I2, A( I2+1, I1 ), 1, - $ A( I2+1, I2 ), 1 ) + IF( I2.LT.N ) + $ CALL CSWAP( N-I2, A( I2+1, I1 ), 1, + $ A( I2+1, I2 ), 1 ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) diff --git a/lapack-netlib/SRC/csytri2.f b/lapack-netlib/SRC/csytri2.f index 4bd8e4f99..8bee149c4 100644 --- a/lapack-netlib/SRC/csytri2.f +++ b/lapack-netlib/SRC/csytri2.f @@ -62,7 +62,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers +*> On entry, the block diagonal matrix D and the multipliers *> used to obtain the factor U or L as computed by CSYTRF. *> *> On exit, if INFO = 0, the (symmetric) inverse of the original @@ -82,7 +82,7 @@ *> \param[in] IPIV *> \verbatim *> IPIV is INTEGER array, dimension (N) -*> Details of the interchanges and the NB structure of D +*> Details of the interchanges and the block structure of D *> as determined by CSYTRF. *> \endverbatim *> diff --git a/lapack-netlib/SRC/csytrs2.f b/lapack-netlib/SRC/csytrs2.f index 1002b5461..93f2d6a1b 100644 --- a/lapack-netlib/SRC/csytrs2.f +++ b/lapack-netlib/SRC/csytrs2.f @@ -36,7 +36,7 @@ *> *> \verbatim *> -*> CSYTRS2 solves a system of linear equations A*X = B with a COMPLEX +*> CSYTRS2 solves a system of linear equations A*X = B with a complex *> symmetric matrix A using the factorization A = U*D*U**T or *> A = L*D*L**T computed by CSYTRF and converted by CSYCONV. *> \endverbatim diff --git a/lapack-netlib/SRC/csytrs_aa.f b/lapack-netlib/SRC/csytrs_aa.f index 7cf950492..981f8722a 100644 --- a/lapack-netlib/SRC/csytrs_aa.f +++ b/lapack-netlib/SRC/csytrs_aa.f @@ -37,7 +37,7 @@ *> \verbatim *> *> CSYTRS_AA solves a system of linear equations A*X = B with a complex -*> symmetric matrix A using the factorization A = U*T*U**T or +*> symmetric matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by CSYTRF_AA. 
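+*>
+*> The solve is performed in three sweeps (sketched here for
+*> UPLO = 'U'; the lower-triangular case is analogous):
+*>    1. forward substitution with U**T and the row interchanges P**T,
+*>    2. a tridiagonal solve with T (via CGTSV),
+*>    3. backward substitution with U and the interchanges P.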
*> \endverbatim * @@ -49,7 +49,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -68,7 +68,7 @@ *> *> \param[in] A *> \verbatim -*> A is REAL array, dimension (LDA,N) +*> A is COMPLEX array, dimension (LDA,N) *> Details of factors computed by CSYTRF_AA. *> \endverbatim *> @@ -86,7 +86,7 @@ *> *> \param[in,out] B *> \verbatim -*> B is REAL array, dimension (LDB,NRHS) +*> B is COMPLEX array, dimension (LDB,NRHS) *> On entry, the right hand side matrix B. *> On exit, the solution matrix X. *> \endverbatim @@ -97,14 +97,16 @@ *> The leading dimension of the array B. LDB >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim -*> WORK is DOUBLE array, dimension (MAX(1,LWORK)) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER, LWORK >= MAX(1,3*N-2). +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -198,22 +200,29 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. +* +* 1) Forward substitution with U**T +* + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B * -* Pivot, P**T * B + DO K = 1, N + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO * - DO K = 1, N - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO +* Compute U**T \ B -> B [ (U**T \P**T * B) ] * -* Compute (U \P**T * B) -> B [ (U \P**T * B) ] + CALL CTRSM( 'L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB) + END IF * - CALL CTRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) +* 2) Solve with triangular matrix T * -* Compute T \ B -> B [ T \ (U \P**T * B) ] +* Compute T \ B -> B [ T \ (U**T \P**T * B) ] * CALL CLACPY( 'F', 1, N, A( 1, 1 ), LDA+1, WORK( N ), 1) IF( N.GT.1 ) THEN @@ -223,35 +232,48 @@ CALL CGTSV( N, NRHS, WORK( 1 ), WORK( N ), WORK( 2*N ), B, LDB, $ INFO ) * -* Compute (U**T \ B) -> B [ U**T \ (T \ (U \P**T * B) ) ] +* 3) Backward substitution with U +* + IF( N.GT.1 ) THEN * - CALL CTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) +* Compute U \ B -> B [ U \ (T \ (U**T \P**T * B) ) ] * -* Pivot, P * B [ P * (U**T \ (T \ (U \P**T * B) )) ] + CALL CTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB) * - DO K = N, 1, -1 - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO +* Pivot, P * B -> B [ P * (U**T \ (T \ (U \P**T * B) )) ] +* + DO K = N, 1, -1 + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO + END IF * ELSE * * Solve A*X = B, where A = L*T*L**T. 
* -* Pivot, P**T * B +* 1) Forward substitution with L * - DO K = 1, N - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B +* + DO K = 1, N + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO +* +* Compute L \ B -> B [ (L \P**T * B) ] +* + CALL CTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB) + END IF * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* 2) Solve with triangular matrix T * - CALL CTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) * * Compute T \ B -> B [ T \ (L \P**T * B) ] * @@ -263,18 +285,23 @@ CALL CGTSV( N, NRHS, WORK( 1 ), WORK(N), WORK( 2*N ), B, LDB, $ INFO) * -* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] +* 3) Backward substitution with L**T +* + IF( N.GT.1 ) THEN * - CALL CTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) +* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] + CALL CTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB) * - DO K = N, 1, -1 - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO +* Pivot, P * B -> B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* + DO K = N, 1, -1 + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO + END IF * END IF * diff --git a/lapack-netlib/SRC/csytrs_aa_2stage.f b/lapack-netlib/SRC/csytrs_aa_2stage.f index d025c08fe..581910933 100644 --- a/lapack-netlib/SRC/csytrs_aa_2stage.f +++ b/lapack-netlib/SRC/csytrs_aa_2stage.f @@ -36,7 +36,7 @@ *> \verbatim *> *> CSYTRS_AA_2STAGE solves a system of linear equations A*X = B with a complex -*> symmetric matrix A using the factorization A = U*T*U**T or +*> symmetric matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by CSYTRF_AA_2STAGE. *> \endverbatim * @@ -48,7 +48,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -208,15 +208,15 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. 
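+*
+*        Note: in this two-stage variant, T is a band matrix whose LU
+*        factorization with partial pivoting was computed by
+*        CSYTRF_AA_2STAGE; the T-solve below therefore uses the banded
+*        solver CGBTRS rather than the tridiagonal CGTSV used by the
+*        one-stage CSYTRS_AA.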
* IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (U**T \P**T * B) -> B [ (U**T \P**T * B) ] +* Compute (U**T \ B) -> B [ (U**T \P**T * B) ] * CALL CTRSM( 'L', 'U', 'T', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) @@ -234,7 +234,7 @@ CALL CTRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (U \ (T \ (U**T \P**T * B) )) ] +* Pivot, P * B -> B [ P * (U \ (T \ (U**T \P**T * B) )) ] * CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * @@ -246,11 +246,11 @@ * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute (L \ B) -> B [ (L \P**T * B) ] * CALL CTRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) @@ -268,7 +268,7 @@ CALL CTRSM( 'L', 'L', 'T', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* Pivot, P * B -> B [ P * (L**T \ (T \ (L \P**T * B) )) ] * CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * diff --git a/lapack-netlib/SRC/ctgsy2.f b/lapack-netlib/SRC/ctgsy2.f index 66a8980d0..5ccdfb1e1 100644 --- a/lapack-netlib/SRC/ctgsy2.f +++ b/lapack-netlib/SRC/ctgsy2.f @@ -67,7 +67,7 @@ *> R * B**H + L * E**H = scale * -F *> *> This case is used to compute an estimate of Dif[(A, D), (B, E)] = -*> = sigma_min(Z) using reverse communicaton with CLACON. +*> = sigma_min(Z) using reverse communication with CLACON. *> *> CTGSY2 also (IJOB >= 1) contributes to the computation in CTGSYL *> of an upper bound on the separation between to matrix pairs. Then @@ -81,7 +81,7 @@ *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 -*> = 'N', solve the generalized Sylvester equation (1). +*> = 'N': solve the generalized Sylvester equation (1). *> = 'T': solve the 'transposed' system (3). *> \endverbatim *> @@ -89,13 +89,13 @@ *> \verbatim *> IJOB is INTEGER *> Specifies what kind of functionality to be performed. -*> =0: solve (1) only. -*> =1: A contribution from this subsystem to a Frobenius -*> norm-based estimate of the separation between two matrix -*> pairs is computed. (look ahead strategy is used). -*> =2: A contribution from this subsystem to a Frobenius -*> norm-based estimate of the separation between two matrix -*> pairs is computed. (SGECON on sub-systems is used.) +*> = 0: solve (1) only. +*> = 1: A contribution from this subsystem to a Frobenius +*> norm-based estimate of the separation between two matrix +*> pairs is computed. (look ahead strategy is used). +*> = 2: A contribution from this subsystem to a Frobenius +*> norm-based estimate of the separation between two matrix +*> pairs is computed. (SGECON on sub-systems is used.) *> Not referenced if TRANS = 'T'. 
*> \endverbatim *> diff --git a/lapack-netlib/SRC/ctplqt.f b/lapack-netlib/SRC/ctplqt.f index cb4d419b9..39893df48 100644 --- a/lapack-netlib/SRC/ctplqt.f +++ b/lapack-netlib/SRC/ctplqt.f @@ -1,3 +1,5 @@ +*> \brief \b CTPLQT +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/ctplqt2.f b/lapack-netlib/SRC/ctplqt2.f index b16d6149a..d18452aec 100644 --- a/lapack-netlib/SRC/ctplqt2.f +++ b/lapack-netlib/SRC/ctplqt2.f @@ -1,3 +1,5 @@ +*> \brief \b CTPLQT2 +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/ctpmlqt.f b/lapack-netlib/SRC/ctpmlqt.f index cb5f033ca..5899a5335 100644 --- a/lapack-netlib/SRC/ctpmlqt.f +++ b/lapack-netlib/SRC/ctpmlqt.f @@ -1,3 +1,5 @@ +*> \brief \b CTPMLQT +* * Definition: * =========== * @@ -77,7 +79,7 @@ *> *> \param[in] V *> \verbatim -*> V is COMPLEX array, dimension (LDA,K) +*> V is COMPLEX array, dimension (LDV,K) *> The i-th row must contain the vector which defines the *> elementary reflector H(i), for i = 1,2,...,k, as returned by *> DTPLQT in B. See Further Details. diff --git a/lapack-netlib/SRC/ctpmqrt.f b/lapack-netlib/SRC/ctpmqrt.f index fd3d1b109..8d4a36ca8 100644 --- a/lapack-netlib/SRC/ctpmqrt.f +++ b/lapack-netlib/SRC/ctpmqrt.f @@ -94,7 +94,7 @@ *> *> \param[in] V *> \verbatim -*> V is COMPLEX array, dimension (LDA,K) +*> V is COMPLEX array, dimension (LDV,K) *> The i-th column must contain the vector which defines the *> elementary reflector H(i), for i = 1,2,...,k, as returned by *> CTPQRT in B. See Further Details. diff --git a/lapack-netlib/SRC/ctprfb.f b/lapack-netlib/SRC/ctprfb.f index 1538deb56..0f45edaf8 100644 --- a/lapack-netlib/SRC/ctprfb.f +++ b/lapack-netlib/SRC/ctprfb.f @@ -152,8 +152,8 @@ *> \verbatim *> LDA is INTEGER *> The leading dimension of the array A. -*> If SIDE = 'L', LDC >= max(1,K); -*> If SIDE = 'R', LDC >= max(1,M). +*> If SIDE = 'L', LDA >= max(1,K); +*> If SIDE = 'R', LDA >= max(1,M). *> \endverbatim *> *> \param[in,out] B diff --git a/lapack-netlib/SRC/cungtsqr.f b/lapack-netlib/SRC/cungtsqr.f new file mode 100644 index 000000000..bc5305cf9 --- /dev/null +++ b/lapack-netlib/SRC/cungtsqr.f @@ -0,0 +1,307 @@ +*> \brief \b CUNGTSQR +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CUNGTSQR + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> +* Definition: +* =========== +* +* SUBROUTINE CUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, +* $ INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNGTSQR generates an M-by-N complex matrix Q_out with orthonormal +*> columns, which are the first N columns of a product of comlpex unitary +*> matrices of order M which are returned by CLATSQR +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> See the documentation for CLATSQR. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by DLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). 
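+*>          For example, M = 1000, N = 100 and MB = 300 give
+*>          CEIL((M-N)/(MB-N)) = CEIL(900/200) = 5 input row blocks
+*>          (the quantity NIRB used in the description of T below).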
+*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by CLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not accessed. +*> The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by CLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored) (same format as the output A +*> below the diagonal in CLATSQR). +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of NIRB block reflector sequences +*> is stored in a larger NB-by-N column block of T and consists +*> of NICB smaller NB-by-NB upper-triangular column blocks. +*> (same format as the output T in CLATSQR). +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB1,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX array, dimension (MAX(2,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. LWORK >= (M+NB)*N. +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup comlexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
+ COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CLAMTSQR, CLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + LQUERY = LWORK.EQ.-1 + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array C(LDC, N) and WORK(LWORK) +* in the call to CLAMTSQR. See the documentation for CLAMTSQR. +* + IF( LWORK.LT.2 .AND. (.NOT.LQUERY) ) THEN + INFO = -10 + ELSE +* +* Set block size for column blocks +* + NBLOCAL = MIN( NB, N ) +* +* LWORK = -1, then set the size for the array C(LDC,N) +* in CLAMTSQR call and set the optimal size of the work array +* WORK(LWORK) in CLAMTSQR call. +* + LDC = M + LC = LDC*N + LW = N * NBLOCAL +* + LWORKOPT = LC+LW +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -10 + END IF + END IF +* + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CUNGTSQR', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in +* of M-by-M orthogonal matrix Q_in, which is implicitly stored in +* the subdiagonal part of input array A and in the input array T. +* Perform by the following operation using the routine CLAMTSQR. +* +* Q1_in = Q_in * ( I ), where I is a N-by-N identity matrix, +* ( 0 ) 0 is a (M-N)-by-N zero matrix. +* +* (1a) Form M-by-N matrix in the array WORK(1:LDC*N) with ones +* on the diagonal and zeros elsewhere. +* + CALL CLASET( 'F', M, N, CZERO, CONE, WORK, LDC ) +* +* (1b) On input, WORK(1:LDC*N) stores ( I ); +* ( 0 ) +* +* On output, WORK(1:LDC*N) stores Q1_in. +* + CALL CLAMTSQR( 'L', 'N', M, N, N, MB, NBLOCAL, A, LDA, T, LDT, + $ WORK, LDC, WORK( LC+1 ), LW, IINFO ) +* +* (2) Copy the result from the part of the work array (1:M,1:N) +* with the leading dimension LDC that starts at WORK(1) into +* the output array A(1:M,1:N) column-by-column. +* + DO J = 1, N + CALL CCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 ) + END DO +* + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN +* +* End of CUNGTSQR +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/cunhr_col.f b/lapack-netlib/SRC/cunhr_col.f new file mode 100644 index 000000000..15c31491e --- /dev/null +++ b/lapack-netlib/SRC/cunhr_col.f @@ -0,0 +1,441 @@ +*> \brief \b CUNHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CUNHR_COL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> +* Definition: +* =========== +* +* SUBROUTINE CUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, M, N, NB +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), D( * ), T( LDT, * ) +* .. 
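+*
+*  A typical calling sequence (a sketch only; workspace queries and
+*  error handling omitted, array dimensions as documented below)
+*  chains this routine with CLATSQR and CUNGTSQR:
+*
+*     CALL CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, INFO )
+*     CALL CUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, INFO )
+*     CALL CUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+*
+*  i.e. factor the tall-skinny A, form the explicit factor Q_in in A,
+*  and then reconstruct conventional Householder vectors and block
+*  reflectors from it.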
+* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNHR_COL takes an M-by-N complex matrix Q_in with orthonormal columns +*> as input, stored in A, and performs Householder Reconstruction (HR), +*> i.e. reconstructs Householder vectors V(i) implicitly representing +*> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S, +*> where S is an N-by-N diagonal matrix with diagonal entries +*> equal to +1 or -1. The Householder vectors (columns V(i) of V) are +*> stored in A on output, and the diagonal entries of S are stored in D. +*> Block reflectors are also returned in T +*> (same output format as CGEQRT). +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size to be used in the reconstruction +*> of Householder column vector blocks in the array A and +*> corresponding block reflectors in the array T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size.) +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> +*> The array A contains an M-by-N orthonormal matrix Q_in, +*> i.e the columns of A are orthogonal unit vectors. +*> +*> On exit: +*> +*> The elements below the diagonal of A represent the unit +*> lower-trapezoidal matrix V of Householder column vectors +*> V(i). The unit diagonal entries of V are not stored +*> (same format as the output below the diagonal in A from +*> CGEQRT). The matrix T and the matrix V stored on output +*> in A implicitly define Q_out. +*> +*> The elements above the diagonal contain the factor U +*> of the "modified" LU-decomposition: +*> Q_in - ( S ) = V * U +*> ( 0 ) +*> where 0 is a (M-N)-by-(M-N) zero matrix. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX array, +*> dimension (LDT, N) +*> +*> Let NOCB = Number_of_output_col_blocks +*> = CEIL(N/NB) +*> +*> On exit, T(1:NB, 1:N) contains NOCB upper-triangular +*> block reflectors used to define Q_out stored in compact +*> form as a sequence of upper-triangular NB-by-NB column +*> blocks (same format as the output T in CGEQRT). +*> The matrix T and the matrix V stored on output in A +*> implicitly define Q_out. NOTE: The lower triangles +*> below the upper-triangular blcoks will be filled with +*> zeros. See Further Details. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is COMPLEX array, dimension min(M,N). +*> The elements can be only plus or minus one. +*> +*> D(i) is constructed as D(i) = -SIGN(Q_in_i(i,i)), where +*> 1 <= i <= min(M,N), and Q_in_i is Q_in after performing +*> i-1 steps of “modified” Gaussian elimination. +*> See Further Details. 
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*> The computed M-by-M unitary factor Q_out is defined implicitly as
+*> a product of unitary matrices Q_out(i). Each Q_out(i) is stored in
+*> the compact WY-representation format in the corresponding blocks of
+*> matrices V (stored in A) and T.
+*>
+*> The M-by-N unit lower-trapezoidal matrix V stored in the M-by-N
+*> matrix A contains the column vectors V(i) in NB-size column
+*> blocks VB(j). For example, VB(1) contains the columns
+*> V(1), V(2), ... V(NB). NOTE: The unit entries on
+*> the diagonal of V are not stored in A.
+*>
+*> The number of column blocks is
+*>
+*>    NOCB = Number_of_output_col_blocks = CEIL(N/NB)
+*>
+*> where each block is of order NB except for the last block, which
+*> is of order LAST_NB = N - (NOCB-1)*NB.
+*>
+*> For example, if M=6, N=5 and NB=2, the matrix V is
+*>
+*>
+*>     V = (  VB(1),  VB(2),  VB(3) ) =
+*>
+*>       = (   1                      )
+*>         (  v21    1                )
+*>         (  v31   v32    1          )
+*>         (  v41   v42   v43   1     )
+*>         (  v51   v52   v53  v54  1 )
+*>         (  v61   v62   v63  v64 v65 )
+*>
+*>
+*> For each of the column blocks VB(i), an upper-triangular block
+*> reflector TB(i) is computed. These blocks are stored as
+*> a sequence of upper-triangular column blocks in the NB-by-N
+*> matrix T. The size of each TB(i) block is NB-by-NB, except
+*> for the last block, whose size is LAST_NB-by-LAST_NB.
+*>
+*> For example, if M=6, N=5 and NB=2, the matrix T is
+*>
+*>     T  = (  TB(1),  TB(2), TB(3) ) =
+*>
+*>        = (  t11  t12  t13  t14  t15  )
+*>          (       t22       t24       )
+*>
+*>
+*> The M-by-M factor Q_out is given as a product of NOCB
+*> unitary M-by-M matrices Q_out(i).
+*>
+*>     Q_out = Q_out(1) * Q_out(2) * ... * Q_out(NOCB),
+*>
+*> where each matrix Q_out(i) is given by the WY-representation
+*> using corresponding blocks from the matrices V and T:
+*>
+*>     Q_out(i) = I - VB(i) * TB(i) * (VB(i))**H,
+*>
+*> where I is the identity matrix. Here is the formula with matrix
+*> dimensions:
+*>
+*>  Q(i){M-by-M} = I{M-by-M} -
+*>    VB(i){M-by-INB} * TB(i){INB-by-INB} * (VB(i))**H {INB-by-M},
+*>
+*> where INB = NB, except for the last block NOCB
+*> for which INB=LAST_NB.
+*>
+*> =====
+*> NOTE:
+*> =====
+*>
+*> If Q_in is the result of doing a QR factorization
+*> B = Q_in * R_in, then:
+*>
+*> B = (Q_out*S) * R_in = Q_out * (S * R_in) = Q_out * R_out.
+*>
+*> So if one wants to interpret Q_out as the result
+*> of the QR factorization of B, then corresponding R_out
+*> should be obtained by R_out = S * R_in, i.e. some rows of R_in
+*> should be multiplied by -1.
+*>
+*> For the details of the algorithm, see [1].
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+* +*> \date November 2019 +* +*> \ingroup complexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, M, N, NB +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), D( * ), T( LDT, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, JBTEMP1, JBTEMP2, JNB, + $ NPLUSONE +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CLAUNHR_COL_GETRFNP, CSCAL, CTRSM, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. N.GT.M ) THEN + INFO = -2 + ELSE IF( NB.LT.1 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -5 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -7 + END IF +* +* Handle error in the input parameters. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CUNHR_COL', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + RETURN + END IF +* +* On input, the M-by-N matrix A contains the unitary +* M-by-N matrix Q_in. +* +* (1) Compute the unit lower-trapezoidal V (ones on the diagonal +* are not stored) by performing the "modified" LU-decomposition. +* +* Q_in - ( S ) = V * U = ( V1 ) * U, +* ( 0 ) ( V2 ) +* +* where 0 is an (M-N)-by-N zero matrix. +* +* (1-1) Factor V1 and U. + + CALL CLAUNHR_COL_GETRFNP( N, N, A, LDA, D, IINFO ) +* +* (1-2) Solve for V2. +* + IF( M.GT.N ) THEN + CALL CTRSM( 'R', 'U', 'N', 'N', M-N, N, CONE, A, LDA, + $ A( N+1, 1 ), LDA ) + END IF +* +* (2) Reconstruct the block reflector T stored in T(1:NB, 1:N) +* as a sequence of upper-triangular blocks with NB-size column +* blocking. +* +* Loop over the column blocks of size NB of the array A(1:M,1:N) +* and the array T(1:NB,1:N), JB is the column index of a column +* block, JNB is the column block size at each step JB. +* + NPLUSONE = N + 1 + DO JB = 1, N, NB +* +* (2-0) Determine the column block size JNB. +* + JNB = MIN( NPLUSONE-JB, NB ) +* +* (2-1) Copy the upper-triangular part of the current JNB-by-JNB +* diagonal block U(JB) (of the N-by-N matrix U) stored +* in A(JB:JB+JNB-1,JB:JB+JNB-1) into the upper-triangular part +* of the current JNB-by-JNB block T(1:JNB,JB:JB+JNB-1) +* column-by-column, total JNB*(JNB+1)/2 elements. +* + JBTEMP1 = JB - 1 + DO J = JB, JB+JNB-1 + CALL CCOPY( J-JBTEMP1, A( JB, J ), 1, T( 1, J ), 1 ) + END DO +* +* (2-2) Perform on the upper-triangular part of the current +* JNB-by-JNB diagonal block U(JB) (of the N-by-N matrix U) stored +* in T(1:JNB,JB:JB+JNB-1) the following operation in place: +* (-1)*U(JB)*S(JB), i.e the result will be stored in the upper- +* triangular part of T(1:JNB,JB:JB+JNB-1). 
This multiplication +* of the JNB-by-JNB diagonal block U(JB) by the JNB-by-JNB +* diagonal block S(JB) of the N-by-N sign matrix S from the +* right means changing the sign of each J-th column of the block +* U(JB) according to the sign of the diagonal element of the block +* S(JB), i.e. S(J,J) that is stored in the array element D(J). +* + DO J = JB, JB+JNB-1 + IF( D( J ).EQ.CONE ) THEN + CALL CSCAL( J-JBTEMP1, -CONE, T( 1, J ), 1 ) + END IF + END DO +* +* (2-3) Perform the triangular solve for the current block +* matrix X(JB): +* +* X(JB) * (A(JB)**T) = B(JB), where: +* +* A(JB)**T is a JNB-by-JNB unit upper-triangular +* coefficient block, and A(JB)=V1(JB), which +* is a JNB-by-JNB unit lower-triangular block +* stored in A(JB:JB+JNB-1,JB:JB+JNB-1). +* The N-by-N matrix V1 is the upper part +* of the M-by-N lower-trapezoidal matrix V +* stored in A(1:M,1:N); +* +* B(JB) is a JNB-by-JNB upper-triangular right-hand +* side block, B(JB) = (-1)*U(JB)*S(JB), and +* B(JB) is stored in T(1:JNB,JB:JB+JNB-1); +* +* X(JB) is a JNB-by-JNB upper-triangular solution +* block, X(JB) is the upper-triangular block +* reflector T(JB), and X(JB) is stored +* in T(1:JNB,JB:JB+JNB-1). +* +* In other words, we perform the triangular solve for the +* upper-triangular block T(JB): +* +* T(JB) * (V1(JB)**T) = (-1)*U(JB)*S(JB). +* +* Even though the blocks X(JB) and B(JB) are upper- +* triangular, the routine CTRSM will access all JNB**2 +* elements of the square T(1:JNB,JB:JB+JNB-1). Therefore, +* we need to set to zero the elements of the block +* T(1:JNB,JB:JB+JNB-1) below the diagonal before the call +* to CTRSM. +* +* (2-3a) Set the elements to zero. +* + JBTEMP2 = JB - 2 + DO J = JB, JB+JNB-2 + DO I = J-JBTEMP2, NB + T( I, J ) = CZERO + END DO + END DO +* +* (2-3b) Perform the triangular solve. +* + CALL CTRSM( 'R', 'L', 'C', 'U', JNB, JNB, CONE, + $ A( JB, JB ), LDA, T( 1, JB ), LDT ) +* + END DO +* + RETURN +* +* End of CUNHR_COL +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/dbdsqr.f b/lapack-netlib/SRC/dbdsqr.f index 93db95e7a..7d47fa282 100644 --- a/lapack-netlib/SRC/dbdsqr.f +++ b/lapack-netlib/SRC/dbdsqr.f @@ -166,7 +166,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (4*N) +*> WORK is DOUBLE PRECISION array, dimension (4*(N-1)) *> \endverbatim *> *> \param[out] INFO diff --git a/lapack-netlib/SRC/dbdsvdx.f b/lapack-netlib/SRC/dbdsvdx.f index 96fdb3d61..10d97a71f 100644 --- a/lapack-netlib/SRC/dbdsvdx.f +++ b/lapack-netlib/SRC/dbdsvdx.f @@ -165,7 +165,7 @@ *> *> \param[out] Z *> \verbatim -*> Z is DOUBLE PRECISION array, dimension (2*N,K) ) +*> Z is DOUBLE PRECISION array, dimension (2*N,K) *> If JOBZ = 'V', then if INFO = 0 the first NS columns of Z *> contain the singular vectors of the matrix B corresponding to *> the selected singular values, with U in rows 1 to N and V diff --git a/lapack-netlib/SRC/dcombssq.f b/lapack-netlib/SRC/dcombssq.f new file mode 100644 index 000000000..79f6d95c9 --- /dev/null +++ b/lapack-netlib/SRC/dcombssq.f @@ -0,0 +1,92 @@ +*> \brief \b DCOMBSSQ adds two scaled sum of squares quantities. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE DCOMBSSQ( V1, V2 ) +* +* .. Array Arguments .. +* DOUBLE PRECISION V1( 2 ), V2( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DCOMBSSQ adds two scaled sum of squares quantities, V1 := V1 + V2. 
+*> That is, +*> +*> V1_scale**2 * V1_sumsq := V1_scale**2 * V1_sumsq +*> + V2_scale**2 * V2_sumsq +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in,out] V1 +*> \verbatim +*> V1 is DOUBLE PRECISION array, dimension (2). +*> The first scaled sum. +*> V1(1) = V1_scale, V1(2) = V1_sumsq. +*> \endverbatim +*> +*> \param[in] V2 +*> \verbatim +*> V2 is DOUBLE PRECISION array, dimension (2). +*> The second scaled sum. +*> V2(1) = V2_scale, V2(2) = V2_sumsq. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2018 +* +*> \ingroup OTHERauxiliary +* +* ===================================================================== + SUBROUTINE DCOMBSSQ( V1, V2 ) +* +* -- LAPACK auxiliary routine (version 3.7.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2018 +* +* .. Array Arguments .. + DOUBLE PRECISION V1( 2 ), V2( 2 ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* + IF( V1( 1 ).GE.V2( 1 ) ) THEN + IF( V1( 1 ).NE.ZERO ) THEN + V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 ) + END IF + ELSE + V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 ) + V1( 1 ) = V2( 1 ) + END IF + RETURN +* +* End of DCOMBSSQ +* + END diff --git a/lapack-netlib/SRC/dgbrfsx.f b/lapack-netlib/SRC/dgbrfsx.f index fb52d643f..76afb2d6a 100644 --- a/lapack-netlib/SRC/dgbrfsx.f +++ b/lapack-netlib/SRC/dgbrfsx.f @@ -308,7 +308,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -344,14 +344,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension (NPARAMS) -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -359,9 +359,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. 
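Editor's aside: DCOMBSSQ above always rescales relative to the larger of the
two scales, so the quotient that gets squared is at most one and the update
itself cannot overflow. A minimal driver sketch (a hypothetical smoke test,
not part of this patch) that combines 2**2 * 3 = 12 with 4**2 * 1 = 16:

      PROGRAM TCOMBQ
*     Hypothetical smoke test for DCOMBSSQ (editorial sketch, not
*     part of the patch). V1 represents 2**2 * 3 = 12 and V2
*     represents 4**2 * 1 = 16; the combined pair must represent 28.
      DOUBLE PRECISION V1( 2 ), V2( 2 )
      EXTERNAL DCOMBSSQ
      V1( 1 ) = 2.0D+0
      V1( 2 ) = 3.0D+0
      V2( 1 ) = 4.0D+0
      V2( 2 ) = 1.0D+0
      CALL DCOMBSSQ( V1, V2 )
*     Since V1(1) < V2(1), the routine keeps the larger scale:
*     V1 = ( 4.0, 1.75 ), and 4.0**2 * 1.75 = 28 = 12 + 16.
      WRITE( *, * ) V1( 1 ), V1( 2 )
      END

Branching on the larger scale is what keeps ( smaller/larger )**2 <= 1, the
same device DLASSQ uses to accumulate sums of squares safely.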
diff --git a/lapack-netlib/SRC/dgbsvxx.f b/lapack-netlib/SRC/dgbsvxx.f index 819d20c6d..058b20686 100644 --- a/lapack-netlib/SRC/dgbsvxx.f +++ b/lapack-netlib/SRC/dgbsvxx.f @@ -431,7 +431,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -467,14 +467,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension (NPARAMS) -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -482,9 +482,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. +*> = 1.0: Use the extra-precise refinement algorithm. *> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/dgebak.f b/lapack-netlib/SRC/dgebak.f index 45a86ee57..10a78aa1a 100644 --- a/lapack-netlib/SRC/dgebak.f +++ b/lapack-netlib/SRC/dgebak.f @@ -47,10 +47,10 @@ *> \verbatim *> JOB is CHARACTER*1 *> Specifies the type of backward transformation required: -*> = 'N', do nothing, return immediately; -*> = 'P', do backward transformation for permutation only; -*> = 'S', do backward transformation for scaling only; -*> = 'B', do backward transformations for both permutation and +*> = 'N': do nothing, return immediately; +*> = 'P': do backward transformation for permutation only; +*> = 'S': do backward transformation for scaling only; +*> = 'B': do backward transformations for both permutation and *> scaling. *> JOB must be the same as the argument JOB supplied to DGEBAL. *> \endverbatim diff --git a/lapack-netlib/SRC/dgeesx.f b/lapack-netlib/SRC/dgeesx.f index 26042a5f9..a08104d3d 100644 --- a/lapack-netlib/SRC/dgeesx.f +++ b/lapack-netlib/SRC/dgeesx.f @@ -583,7 +583,9 @@ IF( N.GT.I+1 ) $ CALL DSWAP( N-I-1, A( I, I+2 ), LDA, $ A( I+1, I+2 ), LDA ) - CALL DSWAP( N, VS( 1, I ), 1, VS( 1, I+1 ), 1 ) + IF( WANTVS ) THEN + CALL DSWAP( N, VS( 1, I ), 1, VS( 1, I+1 ), 1 ) + END IF A( I, I+1 ) = A( I+1, I ) A( I+1, I ) = ZERO END IF diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f index 25ed248d0..a30cfab87 100644 --- a/lapack-netlib/SRC/dgejsv.f +++ b/lapack-netlib/SRC/dgejsv.f @@ -82,7 +82,7 @@ *> desirable, then this option is advisable. The input matrix A *> is preprocessed with QR factorization with FULL (row and *> column) pivoting. 
-*> = 'G' Computation as with 'F' with an additional estimate of the +*> = 'G': Computation as with 'F' with an additional estimate of the *> condition number of B, where A=D*B. If A has heavily weighted *> rows, then using this condition number gives too pessimistic *> error bound. @@ -133,7 +133,7 @@ *> specified range. If A .NE. 0 is scaled so that the largest singular *> value of c*A is around DSQRT(BIG), BIG=SLAMCH('O'), then JOBR issues *> the licence to kill columns of A whose norm in c*A is less than -*> DSQRT(SFMIN) (for JOBR.EQ.'R'), or less than SMALL=SFMIN/EPSLN, +*> DSQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, *> where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E'). *> = 'N': Do not kill small columns of c*A. This option assumes that *> BLAS and QR factorizations and triangular solvers are @@ -230,7 +230,7 @@ *> If JOBU = 'F', then U contains on exit the M-by-M matrix of *> the left singular vectors, including an ONB *> of the orthogonal complement of the Range(A). -*> If JOBU = 'W' .AND. (JOBV.EQ.'V' .AND. JOBT.EQ.'T' .AND. M.EQ.N), +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), *> then U is used as workspace if the procedure *> replaces A with A^t. In that case, [V] is computed *> in U as left singular vectors of A^t and then @@ -252,7 +252,7 @@ *> V is DOUBLE PRECISION array, dimension ( LDV, N ) *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; -*> If JOBV = 'W', AND (JOBU.EQ.'U' AND JOBT.EQ.'T' AND M.EQ.N), +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), *> then V is used as workspace if the pprocedure *> replaces A with A^t. In that case, [U] is computed *> in V as right singular vectors of A^t and then @@ -272,13 +272,13 @@ *> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (LWORK) -*> On exit, if N.GT.0 .AND. M.GT.0 (else not referenced), +*> On exit, if N > 0 .AND. M > 0 (else not referenced), *> WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such *> that SCALE*SVA(1:N) are the computed singular values *> of A. (See the description of SVA().) *> WORK(2) = See the description of WORK(1). *> WORK(3) = SCONDA is an estimate for the condition number of -*> column equilibrated A. (If JOBA .EQ. 'E' or 'G') +*> column equilibrated A. (If JOBA = 'E' or 'G') *> SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1). *> It is computed using DPOCON. It holds *> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA @@ -297,7 +297,7 @@ *> triangular factor in the first QR factorization. *> WORK(5) = an estimate of the scaled condition number of the *> triangular factor in the second QR factorization. -*> The following two parameters are computed if JOBT .EQ. 'T'. +*> The following two parameters are computed if JOBT = 'T'. *> They are provided for a developer/implementer who is familiar *> with the details of the method. *> @@ -313,8 +313,8 @@ *> Length of WORK to confirm proper allocation of work space. *> LWORK depends on the job: *> -*> If only SIGMA is needed ( JOBU.EQ.'N', JOBV.EQ.'N' ) and -*> -> .. no scaled condition estimate required (JOBE.EQ.'N'): +*> If only SIGMA is needed (JOBU = 'N', JOBV = 'N') and +*> -> .. no scaled condition estimate required (JOBE = 'N'): *> LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement. *> ->> For optimal performance (blocked code) the optimal value *> is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). 
Here NB is the optimal @@ -330,7 +330,7 @@ *> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), *> N+N*N+LWORK(DPOCON),7). *> -*> If SIGMA and the right singular vectors are needed (JOBV.EQ.'V'), +*> If SIGMA and the right singular vectors are needed (JOBV = 'V'), *> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). *> -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7), *> where NB is the optimal block size for DGEQP3, DGEQRF, DGELQF, @@ -341,19 +341,19 @@ *> If SIGMA and the left singular vectors are needed *> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). *> -> For optimal performance: -*> if JOBU.EQ.'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7), -*> if JOBU.EQ.'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7), +*> if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7), +*> if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7), *> where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR. *> In general, the optimal length LWORK is computed as *> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON), *> 2*N+LWORK(DGEQRF), N+LWORK(DORMQR)). -*> Here LWORK(DORMQR) equals N*NB (for JOBU.EQ.'U') or -*> M*NB (for JOBU.EQ.'F'). +*> Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or +*> M*NB (for JOBU = 'F'). *> -*> If the full SVD is needed: (JOBU.EQ.'U' or JOBU.EQ.'F') and -*> -> if JOBV.EQ.'V' +*> If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> -> if JOBV = 'V' *> the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N). -*> -> if JOBV.EQ.'J' the minimal requirement is +*> -> if JOBV = 'J' the minimal requirement is *> LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6). *> -> For optimal performance, LWORK should be additionally *> larger than N+M*NB, where NB is the optimal block size @@ -369,7 +369,7 @@ *> of JOBA and JOBR. *> IWORK(2) = the number of the computed nonzero singular values *> IWORK(3) = if nonzero, a warning message: -*> If IWORK(3).EQ.1 then some of the column norms of A +*> If IWORK(3) = 1 then some of the column norms of A *> were denormalized floats. The requested high accuracy *> is not warranted by the data. *> \endverbatim @@ -377,10 +377,10 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> < 0 : if INFO = -i, then the i-th argument had an illegal value. -*> = 0 : successful exit; -*> > 0 : DGEJSV did not converge in the maximal allowed number -*> of sweeps. The computed values may be inaccurate. +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: DGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. *> \endverbatim * * Authors: @@ -953,7 +953,7 @@ IF ( L2ABER ) THEN * Standard absolute error bound suffices. All sigma_i with * sigma_i < N*EPSLN*||A|| are flushed to zero. This is an -* agressive enforcement of lower numerical rank by introducing a +* aggressive enforcement of lower numerical rank by introducing a * backward error of the order of N*EPSLN*||A||. TEMP1 = DSQRT(DBLE(N))*EPSLN DO 3001 p = 2, N @@ -965,7 +965,7 @@ 3001 CONTINUE 3002 CONTINUE ELSE IF ( L2RANK ) THEN -* .. similarly as above, only slightly more gentle (less agressive). +* .. similarly as above, only slightly more gentle (less aggressive). * Sudden drop on the diagonal of R1 is used as the criterion for * close-to-rank-deficient. TEMP1 = DSQRT(SFMIN) @@ -1294,7 +1294,7 @@ CALL DPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1, $ WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR) CONDR1 = ONE / DSQRT(TEMP1) -* .. 
here need a second oppinion on the condition number +* .. here need a second opinion on the condition number * .. then assume worst case scenario * R1 is OK for inverse <=> CONDR1 .LT. DBLE(N) * more conservative <=> CONDR1 .LT. DSQRT(DBLE(N)) @@ -1335,7 +1335,7 @@ ELSE * * .. ill-conditioned case: second QRF with pivoting -* Note that windowed pivoting would be equaly good +* Note that windowed pivoting would be equally good * numerically, and more run-time efficient. So, in * an optimal implementation, the next call to DGEQP3 * should be replaced with eg. CALL SGEQPX (ACM TOMS #782) @@ -1388,7 +1388,7 @@ * IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 -* (this overwrittes the copy of R2, as it will not be +* (this overwrites the copy of R2, as it will not be * needed in this branch, but it does not overwritte the * Huseholder vectors of Q2.). CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) @@ -1638,7 +1638,7 @@ * * This branch deploys a preconditioned Jacobi SVD with explicitly * accumulated rotations. It is included as optional, mainly for -* experimental purposes. It does perfom well, and can also be used. +* experimental purposes. It does perform well, and can also be used. * In this implementation, this branch will be automatically activated * if the condition number sigma_max(A) / sigma_min(A) is predicted * to be greater than the overflow threshold. This is because the diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index ece645079..fc14d892f 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -1,3 +1,4 @@ +*> \brief \b DGELQ * * Definition: * =========== @@ -17,7 +18,17 @@ * ============= *> *> \verbatim -*> DGELQ computes a LQ factorization of an M-by-N matrix A. +*> +*> DGELQ computes an LQ factorization of a real M-by-N matrix A: +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a N-by-N orthogonal matrix; +*> L is an lower-triangular M-by-M matrix; +*> 0 is a M-by-(N-M) zero matrix, if M < N. +*> *> \endverbatim * * Arguments: @@ -138,7 +149,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> @@ -159,10 +170,10 @@ SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, TSIZE, LWORK diff --git a/lapack-netlib/SRC/dgelq2.f b/lapack-netlib/SRC/dgelq2.f index 04aa57fc1..a6c835de4 100644 --- a/lapack-netlib/SRC/dgelq2.f +++ b/lapack-netlib/SRC/dgelq2.f @@ -33,8 +33,16 @@ *> *> \verbatim *> -*> DGELQ2 computes an LQ factorization of a real m by n matrix A: -*> A = L * Q. +*> DGELQ2 computes an LQ factorization of a real m-by-n matrix A: +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a n-by-n orthogonal matrix; +*> L is an lower-triangular m-by-m matrix; +*> 0 is a m-by-(n-m) zero matrix, if m < n. +*> *> \endverbatim * * Arguments: @@ -96,7 +104,7 @@ *> \author Univ. 
of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup doubleGEcomputational * @@ -121,10 +129,10 @@ * ===================================================================== SUBROUTINE DGELQ2( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f index 834c47168..4b11761f6 100644 --- a/lapack-netlib/SRC/dgelqf.f +++ b/lapack-netlib/SRC/dgelqf.f @@ -34,7 +34,15 @@ *> \verbatim *> *> DGELQF computes an LQ factorization of a real M-by-N matrix A: -*> A = L * Q. +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a N-by-N orthogonal matrix; +*> L is an lower-triangular M-by-M matrix; +*> 0 is a M-by-(N-M) zero matrix, if M < N. +*> *> \endverbatim * * Arguments: @@ -110,7 +118,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup doubleGEcomputational * @@ -135,10 +143,10 @@ * ===================================================================== SUBROUTINE DGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f index bb6b2868f..dea693c24 100644 --- a/lapack-netlib/SRC/dgemlq.f +++ b/lapack-netlib/SRC/dgemlq.f @@ -1,3 +1,4 @@ +*> \brief \b DGEMLQ * * Definition: * =========== @@ -144,7 +145,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f index 8509b13d9..0f7a42233 100644 --- a/lapack-netlib/SRC/dgemqr.f +++ b/lapack-netlib/SRC/dgemqr.f @@ -1,3 +1,4 @@ +*> \brief \b DGEMQR * * Definition: * =========== @@ -144,7 +145,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f index d0a1a18f9..0bff5d1f9 100644 --- a/lapack-netlib/SRC/dgeqr.f +++ b/lapack-netlib/SRC/dgeqr.f @@ -1,3 +1,4 @@ +*> \brief \b DGEQR * * Definition: * =========== @@ -17,7 +18,18 @@ * ============= *> *> \verbatim -*> DGEQR computes a QR factorization of an M-by-N matrix A. 
+*> +*> DGEQR computes a QR factorization of a real M-by-N matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -138,7 +150,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> @@ -160,10 +172,10 @@ SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, TSIZE, LWORK diff --git a/lapack-netlib/SRC/dgeqr2.f b/lapack-netlib/SRC/dgeqr2.f index c1e91e9bd..9ce1feb25 100644 --- a/lapack-netlib/SRC/dgeqr2.f +++ b/lapack-netlib/SRC/dgeqr2.f @@ -33,8 +33,17 @@ *> *> \verbatim *> -*> DGEQR2 computes a QR factorization of a real m by n matrix A: -*> A = Q * R. +*> DGEQR2 computes a QR factorization of a real m-by-n matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a m-by-m orthogonal matrix; +*> R is an upper-triangular n-by-n matrix; +*> 0 is a (m-n)-by-n zero matrix, if m > n. +*> *> \endverbatim * * Arguments: @@ -96,7 +105,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup doubleGEcomputational * @@ -121,10 +130,10 @@ * ===================================================================== SUBROUTINE DGEQR2( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/dgeqr2p.f b/lapack-netlib/SRC/dgeqr2p.f index 921f79921..9b81ccb33 100644 --- a/lapack-netlib/SRC/dgeqr2p.f +++ b/lapack-netlib/SRC/dgeqr2p.f @@ -33,8 +33,18 @@ *> *> \verbatim *> -*> DGEQR2P computes a QR factorization of a real m by n matrix A: -*> A = Q * R. The diagonal entries of R are nonnegative. +*> DGEQR2P computes a QR factorization of a real m-by-n matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a m-by-m orthogonal matrix; +*> R is an upper-triangular n-by-n matrix with nonnegative diagonal +*> entries; +*> 0 is a (m-n)-by-n zero matrix, if m > n. +*> *> \endverbatim * * Arguments: @@ -97,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup doubleGEcomputational * @@ -124,10 +134,10 @@ * ===================================================================== SUBROUTINE DGEQR2P( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. 
of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/dgeqrf.f b/lapack-netlib/SRC/dgeqrf.f index 83d7d8dd7..98666221f 100644 --- a/lapack-netlib/SRC/dgeqrf.f +++ b/lapack-netlib/SRC/dgeqrf.f @@ -34,7 +34,16 @@ *> \verbatim *> *> DGEQRF computes a QR factorization of a real M-by-N matrix A: -*> A = Q * R. +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -111,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup doubleGEcomputational * @@ -136,10 +145,10 @@ * ===================================================================== SUBROUTINE DGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f index d182f98c9..5cf4069ed 100644 --- a/lapack-netlib/SRC/dgeqrfp.f +++ b/lapack-netlib/SRC/dgeqrfp.f @@ -33,8 +33,18 @@ *> *> \verbatim *> -*> DGEQRFP computes a QR factorization of a real M-by-N matrix A: -*> A = Q * R. The diagonal entries of R are nonnegative. +*> DGEQR2P computes a QR factorization of a real M-by-N matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix with nonnegative diagonal +*> entries; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -112,7 +122,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup doubleGEcomputational * @@ -139,10 +149,10 @@ * ===================================================================== SUBROUTINE DGEQRFP( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/dgerfsx.f b/lapack-netlib/SRC/dgerfsx.f index aafca8d10..495ea1726 100644 --- a/lapack-netlib/SRC/dgerfsx.f +++ b/lapack-netlib/SRC/dgerfsx.f @@ -283,7 +283,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -319,14 +319,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. 
*> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension (NPARAMS) -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -334,9 +334,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/dgesc2.f b/lapack-netlib/SRC/dgesc2.f index 2f01a762f..72d8a38f0 100644 --- a/lapack-netlib/SRC/dgesc2.f +++ b/lapack-netlib/SRC/dgesc2.f @@ -90,7 +90,7 @@ *> \verbatim *> SCALE is DOUBLE PRECISION *> On exit, SCALE contains the scale factor. SCALE is chosen -*> 0 <= SCALE <= 1 to prevent owerflow in the solution. +*> 0 <= SCALE <= 1 to prevent overflow in the solution. *> \endverbatim * * Authors: @@ -151,7 +151,7 @@ * .. * .. Executable Statements .. * -* Set constant to control owerflow +* Set constant to control overflow * EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) / EPS diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f index 926607f98..0218900d2 100644 --- a/lapack-netlib/SRC/dgesdd.f +++ b/lapack-netlib/SRC/dgesdd.f @@ -322,7 +322,7 @@ * IF( WNTQN ) THEN * dbdsdc needs only 4*N (or 6*N for uplo=L for LAPACK <= 3.6) -* keep 7*N for backwards compatability. +* keep 7*N for backwards compatibility. BDSPAC = 7*N ELSE BDSPAC = 3*N*N + 4*N @@ -448,7 +448,7 @@ * IF( WNTQN ) THEN * dbdsdc needs only 4*N (or 6*N for uplo=L for LAPACK <= 3.6) -* keep 7*N for backwards compatability. +* keep 7*N for backwards compatibility. BDSPAC = 7*M ELSE BDSPAC = 3*M*M + 4*M diff --git a/lapack-netlib/SRC/dgesvdq.f b/lapack-netlib/SRC/dgesvdq.f new file mode 100644 index 000000000..e495d2bf9 --- /dev/null +++ b/lapack-netlib/SRC/dgesvdq.f @@ -0,0 +1,1385 @@ +*> \brief DGESVDQ computes the singular value decomposition (SVD) with a QR-Preconditioned QR SVD Method for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGESVDQ + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA, +* S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK, +* WORK, LWORK, RWORK, LRWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* CHARACTER JOBA, JOBP, JOBR, JOBU, JOBV +* INTEGER M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LWORK, LRWORK, +* INFO +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), U( LDU, * ), V( LDV, * ), WORK( * ) +* DOUBLE PRECISION S( * ), RWORK( * ) +* INTEGER IWORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGESVDQ computes the singular value decomposition (SVD) of a real +*> M-by-N matrix A, where M >= N. 
The SVD of A is written as +*> [++] [xx] [x0] [xx] +*> A = U * SIGMA * V^*, [++] = [xx] * [ox] * [xx] +*> [++] [xx] +*> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal +*> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements +*> of SIGMA are the singular values of A. The columns of U and V are the +*> left and the right singular vectors of A, respectively. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy in the computed SVD +*> = 'A' The requested accuracy corresponds to having the backward +*> error bounded by || delta A ||_F <= f(m,n) * EPS * || A ||_F, +*> where EPS = DLAMCH('Epsilon'). This authorises DGESVDQ to +*> truncate the computed triangular factor in a rank revealing +*> QR factorization whenever the truncated part is below the +*> threshold of the order of EPS * ||A||_F. This is aggressive +*> truncation level. +*> = 'M' Similarly as with 'A', but the truncation is more gentle: it +*> is allowed only when there is a drop on the diagonal of the +*> triangular factor in the QR factorization. This is medium +*> truncation level. +*> = 'H' High accuracy requested. No numerical rank determination based +*> on the rank revealing QR factorization is attempted. +*> = 'E' Same as 'H', and in addition the condition number of column +*> scaled A is estimated and returned in RWORK(1). +*> N^(-1/4)*RWORK(1) <= ||pinv(A_scaled)||_2 <= N^(1/4)*RWORK(1) +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> = 'P' The rows of A are ordered in decreasing order with respect to +*> ||A(i,:)||_\infty. This enhances numerical accuracy at the cost +*> of extra data movement. Recommended for numerical robustness. +*> = 'N' No row pivoting. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> = 'T' After the initial pivoted QR factorization, DGESVD is applied to +*> the transposed R**T of the computed triangular factor R. This involves +*> some extra data movement (matrix transpositions). Useful for +*> experiments, research and development. +*> = 'N' The triangular factor R is given as input to DGESVD. This may be +*> preferred as it involves less data movement. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> = 'A' All M left singular vectors are computed and returned in the +*> matrix U. See the description of U. +*> = 'S' or 'U' N = min(M,N) left singular vectors are computed and returned +*> in the matrix U. See the description of U. +*> = 'R' Numerical rank NUMRANK is determined and only NUMRANK left singular +*> vectors are computed and returned in the matrix U. +*> = 'F' The N left singular vectors are returned in factored form as the +*> product of the Q factor from the initial QR factorization and the +*> N left singular vectors of (R**T , 0)**T. If row pivoting is used, +*> then the necessary information on the row pivoting is stored in +*> IWORK(N+1:N+M-1). +*> = 'N' The left singular vectors are not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> = 'A', 'V' All N right singular vectors are computed and returned in +*> the matrix V. +*> = 'R' Numerical rank NUMRANK is determined and only NUMRANK right singular +*> vectors are computed and returned in the matrix V. This option is +*> allowed only if JOBU = 'R' or JOBU = 'N'; otherwise it is illegal. +*> = 'N' The right singular vectors are not computed. 
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array of dimensions LDA x N +*> On entry, the input matrix A. +*> On exit, if JOBU .NE. 'N' or JOBV .NE. 'N', the lower triangle of A contains +*> the Householder vectors as stored by DGEQP3. If JOBU = 'F', these Householder +*> vectors together with WORK(1:N) can be used to restore the Q factors from +*> the initial pivoted QR factorization of A. See the description of U. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER. +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array of dimension N. +*> The singular values of A, ordered so that S(i) >= S(i+1). +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is DOUBLE PRECISION array, dimension +*> LDU x M if JOBU = 'A'; see the description of LDU. In this case, +*> on exit, U contains the M left singular vectors. +*> LDU x N if JOBU = 'S', 'U', 'R' ; see the description of LDU. In this +*> case, U contains the leading N or the leading NUMRANK left singular vectors. +*> LDU x N if JOBU = 'F' ; see the description of LDU. In this case U +*> contains N x N orthogonal matrix that can be used to form the left +*> singular vectors. +*> If JOBU = 'N', U is not referenced. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER. +*> The leading dimension of the array U. +*> If JOBU = 'A', 'S', 'U', 'R', LDU >= max(1,M). +*> If JOBU = 'F', LDU >= max(1,N). +*> Otherwise, LDU >= 1. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is DOUBLE PRECISION array, dimension +*> LDV x N if JOBV = 'A', 'V', 'R' or if JOBA = 'E' . +*> If JOBV = 'A', or 'V', V contains the N-by-N orthogonal matrix V**T; +*> If JOBV = 'R', V contains the first NUMRANK rows of V**T (the right +*> singular vectors, stored rowwise, of the NUMRANK largest singular values). +*> If JOBV = 'N' and JOBA = 'E', V is used as a workspace. +*> If JOBV = 'N', and JOBA.NE.'E', V is not referenced. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If JOBV = 'A', 'V', 'R', or JOBA = 'E', LDV >= max(1,N). +*> Otherwise, LDV >= 1. +*> \endverbatim +*> +*> \param[out] NUMRANK +*> \verbatim +*> NUMRANK is INTEGER +*> NUMRANK is the numerical rank first determined after the rank +*> revealing QR factorization, following the strategy specified by the +*> value of JOBA. If JOBV = 'R' and JOBU = 'R', only NUMRANK +*> leading singular values and vectors are then requested in the call +*> of DGESVD. The final value of NUMRANK might be further reduced if +*> some singular values are computed as zeros. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (max(1, LIWORK)). +*> On exit, IWORK(1:N) contains column pivoting permutation of the +*> rank revealing QR factorization. +*> If JOBP = 'P', IWORK(N+1:N+M-1) contains the indices of the sequence +*> of row swaps used in row pivoting. These can be used to restore the +*> left singular vectors in the case JOBU = 'F'. +*> +*> If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> LIWORK(1) returns the minimal LIWORK. 
+*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> LIWORK >= N + M - 1, if JOBP = 'P' and JOBA .NE. 'E'; +*> LIWORK >= N if JOBP = 'N' and JOBA .NE. 'E'; +*> LIWORK >= N + M - 1 + N, if JOBP = 'P' and JOBA = 'E'; +*> LIWORK >= N + N if JOBP = 'N' and JOBA = 'E'. +* +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the WORK, IWORK, and RWORK arrays, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (max(2, LWORK)), used as a workspace. +*> On exit, if, on entry, LWORK.NE.-1, WORK(1:N) contains parameters +*> needed to recover the Q factor from the QR factorization computed by +*> DGEQP3. +*> +*> If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> WORK(1) returns the optimal LWORK, and +*> WORK(2) returns the minimal LWORK. +*> \endverbatim +*> +*> \param[in,out] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. It is determined as follows: +*> Let LWQP3 = 3*N+1, LWCON = 3*N, and let +*> LWORQ = { MAX( N, 1 ), if JOBU = 'R', 'S', or 'U' +*> { MAX( M, 1 ), if JOBU = 'A' +*> LWSVD = MAX( 5*N, 1 ) +*> LWLQF = MAX( N/2, 1 ), LWSVD2 = MAX( 5*(N/2), 1 ), LWORLQ = MAX( N, 1 ), +*> LWQRF = MAX( N/2, 1 ), LWORQ2 = MAX( N, 1 ) +*> Then the minimal value of LWORK is: +*> = MAX( N + LWQP3, LWSVD ) if only the singular values are needed; +*> = MAX( N + LWQP3, LWCON, LWSVD ) if only the singular values are needed, +*> and a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD, LWORQ ) if the singular values and the left +*> singular vectors are requested; +*> = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) if the singular values and the left +*> singular vectors are requested, and also +*> a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD ) if the singular values and the right +*> singular vectors are requested; +*> = N + MAX( LWQP3, LWCON, LWSVD ) if the singular values and the right +*> singular vectors are requested, and also +*> a scaled condition etimate requested; +*> +*> = N + MAX( LWQP3, LWSVD, LWORQ ) if the full SVD is requested with JOBV = 'R'; +*> independent of JOBR; +*> = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) if the full SVD is requested, +*> JOBV = 'R' and, also a scaled condition +*> estimate requested; independent of JOBR; +*> = MAX( N + MAX( LWQP3, LWSVD, LWORQ ), +*> N + MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, N/2+LWORLQ, LWORQ) ) if the +*> full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='N' +*> = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWORQ ), +*> N + MAX( LWQP3, LWCON, N/2+LWLQF, N/2+LWSVD2, N/2+LWORLQ, LWORQ ) ) +*> if the full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='N', and also a scaled condition number estimate +*> requested. +*> = MAX( N + MAX( LWQP3, LWSVD, LWORQ ), +*> N + MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, N/2+LWORQ2, LWORQ ) ) if the +*> full SVD is requested with JOBV = 'A', 'V', and JOBR ='T' +*> = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWORQ ), +*> N + MAX( LWQP3, LWCON, N/2+LWQRF, N/2+LWSVD2, N/2+LWORQ2, LWORQ ) ) +*> if the full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='T', and also a scaled condition number estimate +*> requested. +*> Finally, LWORK must be at least two: LWORK = MAX( 2, LWORK ). 
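Editor's aside: rather than evaluating the case analysis above by hand, a
caller can rely on the workspace query described just below (LWORK = -1). A
hedged sketch follows (hypothetical driver; the 3-by-2 sizes and the job
options are illustrative choices only, not taken from the patch):

      PROGRAM TSVDQW
*     Hypothetical workspace-query sketch for DGESVDQ (editorial,
*     not part of the patch). The query call computes sizes only,
*     so A need not contain data yet.
      INTEGER M, N
      PARAMETER ( M = 3, N = 2 )
      DOUBLE PRECISION A( M, N ), S( N ), U( M, M ), V( N, N )
      DOUBLE PRECISION WQ( 2 ), RWQ( 2 )
      INTEGER IWQ( N ), NUMRANK, INFO, LWQ
      LWQ = -1
*     With LWORK = -1 only the argument checks and the size
*     computations run; on exit WQ(1) and WQ(2) hold the optimal
*     and minimal LWORK, and IWQ(1) holds the minimal LIWORK.
      CALL DGESVDQ( 'H', 'N', 'N', 'A', 'A', M, N, A, M, S, U, M,
     $              V, N, NUMRANK, IWQ, N, WQ, LWQ, RWQ, 2, INFO )
*     A second call would then pass a WORK array of length
*     INT( WQ( 1 ) ) to perform the actual decomposition.
      WRITE( *, * ) INT( WQ( 1 ) ), INT( WQ( 2 ) )
      END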
+*>
+*> If LWORK = -1, then a workspace query is assumed; the routine
+*> only calculates and returns the optimal and minimal sizes
+*> for the WORK, IWORK, and RWORK arrays, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array, dimension (max(1, LRWORK)).
+*> On exit,
+*> 1. If JOBA = 'E', RWORK(1) contains an estimate of the condition
+*> number of column scaled A. If A = C * D where D is diagonal and C
+*> has unit columns in the Euclidean norm, then, assuming full column rank,
+*> N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1).
+*> Otherwise, RWORK(1) = -1.
+*> 2. RWORK(2) contains the number of singular values computed as
+*> exact zeros in DGESVD applied to the upper triangular or trapezoidal
+*> R (from the initial QR factorization). In case of early exit (no call to
+*> DGESVD, such as in the case of zero matrix) RWORK(2) = -1.
+*>
+*> If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*> RWORK(1) returns the minimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*> LRWORK is INTEGER.
+*> The dimension of the array RWORK.
+*> If JOBP = 'P', then LRWORK >= MAX(2, M).
+*> Otherwise, LRWORK >= 2.
+*
+*> If LRWORK = -1, then a workspace query is assumed; the routine
+*> only calculates and returns the optimal and minimal sizes
+*> for the WORK, IWORK, and RWORK arrays, and no error
+*> message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit.
+*> < 0: if INFO = -i, the i-th argument had an illegal value.
+*> > 0: if DBDSQR did not converge, INFO specifies how many superdiagonals
+*> of an intermediate bidiagonal form B (computed in DGESVD) did not
+*> converge to zero.
+*> \endverbatim
+*
+*> \par Further Details:
+* ========================
+*>
+*> \verbatim
+*>
+*> 1. The data movement (matrix transpose) is coded using simple nested
+*> DO-loops because BLAS and LAPACK do not provide corresponding subroutines.
+*> Those DO-loops are easily identified in this source code - by the CONTINUE
+*> statements labeled with 11**. In an optimized version of this code, the
+*> nested DO loops should be replaced with calls to an optimized subroutine.
+*> 2. This code scales A by 1/SQRT(M) if the largest ABS(A(i,j)) could cause
+*> column norm overflow. This is the minimal precaution and it is left to the
+*> SVD routine (DGESVD) to do its own preemptive scaling if potential over-
+*> or underflows are detected. To avoid repeated scanning of the array A,
+*> an optimal implementation would do all necessary scaling before calling
+*> DGESVD and the scaling in DGESVD can be switched off.
+*> 3. Other comments related to code optimization are given in comments in the
+*> code, enclosed in [[double brackets]].
+*> \endverbatim
+*
+*> \par Bugs, examples and comments
+* ===========================
+*
+*> \verbatim
+*> Please report all bugs and send interesting examples and/or comments to
+*> drmac@math.hr. Thank you.
+*> \endverbatim
+*
+*> \par References
+* ===============
+*
+*> \verbatim
+*> [1] Zlatko Drmac, Algorithm 977: A QR-Preconditioned QR SVD Method for
+*> Computing the SVD with High Accuracy. ACM Trans. Math. Softw.
+*> 44(1): 11:1-11:30 (2017)
+*>
+*> SIGMA library, xGESVDQ section updated February 2016.
+*> Developed and coded by Zlatko Drmac, Department of Mathematics +*> University of Zagreb, Croatia, drmac@math.hr +*> \endverbatim +* +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> Developed and coded by Zlatko Drmac, Department of Mathematics +*> University of Zagreb, Croatia, drmac@math.hr +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2018 +* +*> \ingroup doubleGEsing +* +* ===================================================================== + SUBROUTINE DGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA, + $ S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK, + $ WORK, LWORK, RWORK, LRWORK, INFO ) +* .. Scalar Arguments .. + IMPLICIT NONE + CHARACTER JOBA, JOBP, JOBR, JOBU, JOBV + INTEGER M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LWORK, LRWORK, + $ INFO +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), U( LDU, * ), V( LDV, * ), WORK( * ) + DOUBLE PRECISION S( * ), RWORK( * ) + INTEGER IWORK( * ) +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Local Scalars .. + INTEGER IERR, IWOFF, NR, N1, OPTRATIO, p, q + INTEGER LWCON, LWQP3, LWRK_DGELQF, LWRK_DGESVD, LWRK_DGESVD2, + $ LWRK_DGEQP3, LWRK_DGEQRF, LWRK_DORMLQ, LWRK_DORMQR, + $ LWRK_DORMQR2, LWLQF, LWQRF, LWSVD, LWSVD2, LWORQ, + $ LWORQ2, LWORLQ, MINWRK, MINWRK2, OPTWRK, OPTWRK2, + $ IMINWRK, RMINWRK + LOGICAL ACCLA, ACCLM, ACCLH, ASCALED, CONDA, DNTWU, DNTWV, + $ LQUERY, LSVC0, LSVEC, ROWPRM, RSVEC, RTRANS, WNTUA, + $ WNTUF, WNTUR, WNTUS, WNTVA, WNTVR + DOUBLE PRECISION BIG, EPSLN, RTMP, SCONDA, SFMIN +* .. Local Arrays + DOUBLE PRECISION RDUMMY(1) +* .. +* .. External Subroutines (BLAS, LAPACK) + EXTERNAL DGELQF, DGEQP3, DGEQRF, DGESVD, DLACPY, DLAPMT, + $ DLASCL, DLASET, DLASWP, DSCAL, DPOCON, DORMLQ, + $ DORMQR, XERBLA +* .. +* .. External Functions (BLAS, LAPACK) + LOGICAL LSAME + INTEGER IDAMAX + DOUBLE PRECISION DLANGE, DNRM2, DLAMCH + EXTERNAL DLANGE, LSAME, IDAMAX, DNRM2, DLAMCH +* .. +* .. Intrinsic Functions .. +* + INTRINSIC ABS, MAX, MIN, DBLE, SQRT +* +* Test the input arguments +* + WNTUS = LSAME( JOBU, 'S' ) .OR. LSAME( JOBU, 'U' ) + WNTUR = LSAME( JOBU, 'R' ) + WNTUA = LSAME( JOBU, 'A' ) + WNTUF = LSAME( JOBU, 'F' ) + LSVC0 = WNTUS .OR. WNTUR .OR. WNTUA + LSVEC = LSVC0 .OR. WNTUF + DNTWU = LSAME( JOBU, 'N' ) +* + WNTVR = LSAME( JOBV, 'R' ) + WNTVA = LSAME( JOBV, 'A' ) .OR. LSAME( JOBV, 'V' ) + RSVEC = WNTVR .OR. WNTVA + DNTWV = LSAME( JOBV, 'N' ) +* + ACCLA = LSAME( JOBA, 'A' ) + ACCLM = LSAME( JOBA, 'M' ) + CONDA = LSAME( JOBA, 'E' ) + ACCLH = LSAME( JOBA, 'H' ) .OR. CONDA +* + ROWPRM = LSAME( JOBP, 'P' ) + RTRANS = LSAME( JOBR, 'T' ) +* + IF ( ROWPRM ) THEN + IF ( CONDA ) THEN + IMINWRK = MAX( 1, N + M - 1 + N ) + ELSE + IMINWRK = MAX( 1, N + M - 1 ) + END IF + RMINWRK = MAX( 2, M ) + ELSE + IF ( CONDA ) THEN + IMINWRK = MAX( 1, N + N ) + ELSE + IMINWRK = MAX( 1, N ) + END IF + RMINWRK = 2 + END IF + LQUERY = (LIWORK .EQ. -1 .OR. LWORK .EQ. -1 .OR. LRWORK .EQ. -1) + INFO = 0 + IF ( .NOT. ( ACCLA .OR. ACCLM .OR. ACCLH ) ) THEN + INFO = -1 + ELSE IF ( .NOT.( ROWPRM .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = -2 + ELSE IF ( .NOT.( RTRANS .OR. LSAME( JOBR, 'N' ) ) ) THEN + INFO = -3 + ELSE IF ( .NOT.( LSVEC .OR. DNTWU ) ) THEN + INFO = -4 + ELSE IF ( WNTUR .AND. WNTVA ) THEN + INFO = -5 + ELSE IF ( .NOT.( RSVEC .OR. 
DNTWV )) THEN + INFO = -5 + ELSE IF ( M.LT.0 ) THEN + INFO = -6 + ELSE IF ( ( N.LT.0 ) .OR. ( N.GT.M ) ) THEN + INFO = -7 + ELSE IF ( LDA.LT.MAX( 1, M ) ) THEN + INFO = -9 + ELSE IF ( LDU.LT.1 .OR. ( LSVC0 .AND. LDU.LT.M ) .OR. + $ ( WNTUF .AND. LDU.LT.N ) ) THEN + INFO = -12 + ELSE IF ( LDV.LT.1 .OR. ( RSVEC .AND. LDV.LT.N ) .OR. + $ ( CONDA .AND. LDV.LT.N ) ) THEN + INFO = -14 + ELSE IF ( LIWORK .LT. IMINWRK .AND. .NOT. LQUERY ) THEN + INFO = -17 + END IF +* +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LWORK are written with a lot of redundancy and +* can be simplified. However, this detailed form is easier for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for DGEQP3 of an M x N matrix + LWQP3 = 3 * N + 1 +* .. minimal workspace length for DORMQR to build left singular vectors + IF ( WNTUS .OR. WNTUR ) THEN + LWORQ = MAX( N , 1 ) + ELSE IF ( WNTUA ) THEN + LWORQ = MAX( M , 1 ) + END IF +* .. minimal workspace length for DPOCON of an N x N matrix + LWCON = 3 * N +* .. DGESVD of an N x N matrix + LWSVD = MAX( 5 * N, 1 ) + IF ( LQUERY ) THEN + CALL DGEQP3( M, N, A, LDA, IWORK, RDUMMY, RDUMMY, -1, + $ IERR ) + LWRK_DGEQP3 = INT( RDUMMY(1) ) + IF ( WNTUS .OR. WNTUR ) THEN + CALL DORMQR( 'L', 'N', M, N, N, A, LDA, RDUMMY, U, + $ LDU, RDUMMY, -1, IERR ) + LWRK_DORMQR = INT( RDUMMY(1) ) + ELSE IF ( WNTUA ) THEN + CALL DORMQR( 'L', 'N', M, M, N, A, LDA, RDUMMY, U, + $ LDU, RDUMMY, -1, IERR ) + LWRK_DORMQR = INT( RDUMMY(1) ) + ELSE + LWRK_DORMQR = 0 + END IF + END IF + MINWRK = 2 + OPTWRK = 2 + IF ( .NOT. (LSVEC .OR. RSVEC )) THEN +* .. minimal and optimal sizes of the workspace if +* only the singular values are requested + IF ( CONDA ) THEN + MINWRK = MAX( N+LWQP3, LWCON, LWSVD ) + ELSE + MINWRK = MAX( N+LWQP3, LWSVD ) + END IF + IF ( LQUERY ) THEN + CALL DGESVD( 'N', 'N', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_DGESVD = INT( RDUMMY(1) ) + IF ( CONDA ) THEN + OPTWRK = MAX( N+LWRK_DGEQP3, N+LWCON, LWRK_DGESVD ) + ELSE + OPTWRK = MAX( N+LWRK_DGEQP3, LWRK_DGESVD ) + END IF + END IF + ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN +* .. minimal and optimal sizes of the workspace if the +* singular values and the left singular vectors are requested + IF ( CONDA ) THEN + MINWRK = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) + ELSE + MINWRK = N + MAX( LWQP3, LWSVD, LWORQ ) + END IF + IF ( LQUERY ) THEN + IF ( RTRANS ) THEN + CALL DGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + ELSE + CALL DGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + END IF + LWRK_DGESVD = INT( RDUMMY(1) ) + IF ( CONDA ) THEN + OPTWRK = N + MAX( LWRK_DGEQP3, LWCON, LWRK_DGESVD, + $ LWRK_DORMQR ) + ELSE + OPTWRK = N + MAX( LWRK_DGEQP3, LWRK_DGESVD, + $ LWRK_DORMQR ) + END IF + END IF + ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN +* .. 
minimal and optimal sizes of the workspace if the +* singular values and the right singular vectors are requested + IF ( CONDA ) THEN + MINWRK = N + MAX( LWQP3, LWCON, LWSVD ) + ELSE + MINWRK = N + MAX( LWQP3, LWSVD ) + END IF + IF ( LQUERY ) THEN + IF ( RTRANS ) THEN + CALL DGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + ELSE + CALL DGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + END IF + LWRK_DGESVD = INT( RDUMMY(1) ) + IF ( CONDA ) THEN + OPTWRK = N + MAX( LWRK_DGEQP3, LWCON, LWRK_DGESVD ) + ELSE + OPTWRK = N + MAX( LWRK_DGEQP3, LWRK_DGESVD ) + END IF + END IF + ELSE +* .. minimal and optimal sizes of the workspace if the +* full SVD is requested + IF ( RTRANS ) THEN + MINWRK = MAX( LWQP3, LWSVD, LWORQ ) + IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON ) + MINWRK = MINWRK + N + IF ( WNTVA ) THEN +* .. minimal workspace length for N x N/2 DGEQRF + LWQRF = MAX( N/2, 1 ) +* .. minimal workspace lengt for N/2 x N/2 DGESVD + LWSVD2 = MAX( 5 * (N/2), 1 ) + LWORQ2 = MAX( N, 1 ) + MINWRK2 = MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, + $ N/2+LWORQ2, LWORQ ) + IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON ) + MINWRK2 = N + MINWRK2 + MINWRK = MAX( MINWRK, MINWRK2 ) + END IF + ELSE + MINWRK = MAX( LWQP3, LWSVD, LWORQ ) + IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON ) + MINWRK = MINWRK + N + IF ( WNTVA ) THEN +* .. minimal workspace length for N/2 x N DGELQF + LWLQF = MAX( N/2, 1 ) + LWSVD2 = MAX( 5 * (N/2), 1 ) + LWORLQ = MAX( N , 1 ) + MINWRK2 = MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, + $ N/2+LWORLQ, LWORQ ) + IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON ) + MINWRK2 = N + MINWRK2 + MINWRK = MAX( MINWRK, MINWRK2 ) + END IF + END IF + IF ( LQUERY ) THEN + IF ( RTRANS ) THEN + CALL DGESVD( 'O', 'A', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_DGESVD = INT( RDUMMY(1) ) + OPTWRK = MAX(LWRK_DGEQP3,LWRK_DGESVD,LWRK_DORMQR) + IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON ) + OPTWRK = N + OPTWRK + IF ( WNTVA ) THEN + CALL DGEQRF(N,N/2,U,LDU,RDUMMY,RDUMMY,-1,IERR) + LWRK_DGEQRF = INT( RDUMMY(1) ) + CALL DGESVD( 'S', 'O', N/2,N/2, V,LDV, S, U,LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_DGESVD2 = INT( RDUMMY(1) ) + CALL DORMQR( 'R', 'C', N, N, N/2, U, LDU, RDUMMY, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_DORMQR2 = INT( RDUMMY(1) ) + OPTWRK2 = MAX( LWRK_DGEQP3, N/2+LWRK_DGEQRF, + $ N/2+LWRK_DGESVD2, N/2+LWRK_DORMQR2 ) + IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON ) + OPTWRK2 = N + OPTWRK2 + OPTWRK = MAX( OPTWRK, OPTWRK2 ) + END IF + ELSE + CALL DGESVD( 'S', 'O', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_DGESVD = INT( RDUMMY(1) ) + OPTWRK = MAX(LWRK_DGEQP3,LWRK_DGESVD,LWRK_DORMQR) + IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON ) + OPTWRK = N + OPTWRK + IF ( WNTVA ) THEN + CALL DGELQF(N/2,N,U,LDU,RDUMMY,RDUMMY,-1,IERR) + LWRK_DGELQF = INT( RDUMMY(1) ) + CALL DGESVD( 'S','O', N/2,N/2, V, LDV, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_DGESVD2 = INT( RDUMMY(1) ) + CALL DORMLQ( 'R', 'N', N, N, N/2, U, LDU, RDUMMY, + $ V, LDV, RDUMMY,-1,IERR ) + LWRK_DORMLQ = INT( RDUMMY(1) ) + OPTWRK2 = MAX( LWRK_DGEQP3, N/2+LWRK_DGELQF, + $ N/2+LWRK_DGESVD2, N/2+LWRK_DORMLQ ) + IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON ) + OPTWRK2 = N + OPTWRK2 + OPTWRK = MAX( OPTWRK, OPTWRK2 ) + END IF + END IF + END IF + END IF +* + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( 2, OPTWRK ) + IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = -19 +* + END IF +* + IF (INFO .EQ. 0 .AND. LRWORK .LT. RMINWRK .AND. .NOT. 
LQUERY) THEN + INFO = -21 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGESVDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +* +* Return optimal workspace +* + IWORK(1) = IMINWRK + WORK(1) = OPTWRK + WORK(2) = MINWRK + RWORK(1) = RMINWRK + RETURN + END IF +* +* Quick return if the matrix is void. +* + IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) ) THEN +* .. all output is void. + RETURN + END IF +* + BIG = DLAMCH('O') + ASCALED = .FALSE. + IWOFF = 1 + IF ( ROWPRM ) THEN + IWOFF = M +* .. reordering the rows in decreasing sequence in the +* ell-infinity norm - this enhances numerical robustness in +* the case of differently scaled rows. + DO 1904 p = 1, M +* RWORK(p) = ABS( A(p,ICAMAX(N,A(p,1),LDA)) ) +* [[DLANGE will return NaN if an entry of the p-th row is Nan]] + RWORK(p) = DLANGE( 'M', 1, N, A(p,1), LDA, RDUMMY ) +* .. check for NaN's and Inf's + IF ( ( RWORK(p) .NE. RWORK(p) ) .OR. + $ ( (RWORK(p)*ZERO) .NE. ZERO ) ) THEN + INFO = -8 + CALL XERBLA( 'DGESVDQ', -INFO ) + RETURN + END IF + 1904 CONTINUE + DO 1952 p = 1, M - 1 + q = IDAMAX( M-p+1, RWORK(p), 1 ) + p - 1 + IWORK(N+p) = q + IF ( p .NE. q ) THEN + RTMP = RWORK(p) + RWORK(p) = RWORK(q) + RWORK(q) = RTMP + END IF + 1952 CONTINUE +* + IF ( RWORK(1) .EQ. ZERO ) THEN +* Quick return: A is the M x N zero matrix. + NUMRANK = 0 + CALL DLASET( 'G', N, 1, ZERO, ZERO, S, N ) + IF ( WNTUS ) CALL DLASET('G', M, N, ZERO, ONE, U, LDU) + IF ( WNTUA ) CALL DLASET('G', M, M, ZERO, ONE, U, LDU) + IF ( WNTVA ) CALL DLASET('G', N, N, ZERO, ONE, V, LDV) + IF ( WNTUF ) THEN + CALL DLASET( 'G', N, 1, ZERO, ZERO, WORK, N ) + CALL DLASET( 'G', M, N, ZERO, ONE, U, LDU ) + END IF + DO 5001 p = 1, N + IWORK(p) = p + 5001 CONTINUE + IF ( ROWPRM ) THEN + DO 5002 p = N + 1, N + M - 1 + IWORK(p) = p - N + 5002 CONTINUE + END IF + IF ( CONDA ) RWORK(1) = -1 + RWORK(2) = -1 + RETURN + END IF +* + IF ( RWORK(1) .GT. BIG / SQRT(DBLE(M)) ) THEN +* .. to prevent overflow in the QR factorization, scale the +* matrix by 1/sqrt(M) if too large entry detected + CALL DLASCL('G',0,0,SQRT(DBLE(M)),ONE, M,N, A,LDA, IERR) + ASCALED = .TRUE. + END IF + CALL DLASWP( N, A, LDA, 1, M-1, IWORK(N+1), 1 ) + END IF +* +* .. At this stage, preemptive scaling is done only to avoid column +* norms overflows during the QR factorization. The SVD procedure should +* have its own scaling to save the singular values from overflows and +* underflows. That depends on the SVD procedure. +* + IF ( .NOT.ROWPRM ) THEN + RTMP = DLANGE( 'M', M, N, A, LDA, RDUMMY ) + IF ( ( RTMP .NE. RTMP ) .OR. + $ ( (RTMP*ZERO) .NE. ZERO ) ) THEN + INFO = -8 + CALL XERBLA( 'DGESVDQ', -INFO ) + RETURN + END IF + IF ( RTMP .GT. BIG / SQRT(DBLE(M)) ) THEN +* .. to prevent overflow in the QR factorization, scale the +* matrix by 1/sqrt(M) if too large entry detected + CALL DLASCL('G',0,0, SQRT(DBLE(M)),ONE, M,N, A,LDA, IERR) + ASCALED = .TRUE. + END IF + END IF +* +* .. QR factorization with column pivoting +* +* A * P = Q * [ R ] +* [ 0 ] +* + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL DGEQP3( M, N, A, LDA, IWORK, WORK, WORK(N+1), LWORK-N, + $ IERR ) +* +* If the user requested accuracy level allows truncation in the +* computed upper triangular factor, the matrix R is examined and, +* if possible, replaced with its leading upper trapezoidal part. +* + EPSLN = DLAMCH('E') + SFMIN = DLAMCH('S') +* SMALL = SFMIN / EPSLN + NR = N +* + IF ( ACCLA ) THEN +* +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPS*||A||_F are flushed to zero. 
This is an
+*     aggressive enforcement of lower numerical rank by introducing a
+*     backward error of the order of N*EPS*||A||_F.
+         NR = 1
+         RTMP = SQRT(DBLE(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .LT. (RTMP*ABS(A(1,1))) ) GO TO 3002
+               NR = NR + 1
+ 3001    CONTINUE
+ 3002    CONTINUE
+*
+      ELSEIF ( ACCLM ) THEN
+*       .. similarly as above, only slightly more gentle (less aggressive).
+*       Sudden drop on the diagonal of R is used as the criterion for being
+*       close-to-rank-deficient. The threshold is set to EPSLN=DLAMCH('E').
+*       [[This can be made more flexible by replacing this hard-coded value
+*       with a user specified threshold.]] Also, the values that underflow
+*       will be truncated.
+         NR = 1
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SFMIN ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*       .. RRQR not authorized to determine numerical rank except in the
+*       obvious case of zero pivots.
+*       .. inspect R for exact zeros on the diagonal;
+*       R(i,i)=0 => R(i:N,i:N)=0.
+         NR = 1
+         DO 3501 p = 2, N
+            IF ( ABS(A(p,p)) .EQ. ZERO ) GO TO 3502
+            NR = NR + 1
+ 3501    CONTINUE
+ 3502    CONTINUE
+*
+         IF ( CONDA ) THEN
+*           Estimate the scaled condition number of A. Use the fact that it is
+*           the same as the scaled condition number of R.
+*           .. V is used as workspace
+            CALL DLACPY( 'U', N, N, A, LDA, V, LDV )
+*           Only the leading NR x NR submatrix of the triangular factor
+*           is considered. Only if NR=N will this give a reliable error
+*           bound. However, even for NR < N, this can be used on an
+*           expert level and obtain useful information in the sense of
+*           perturbation theory.
+            DO 3053 p = 1, NR
+               RTMP = DNRM2( p, V(1,p), 1 )
+               CALL DSCAL( p, ONE/RTMP, V(1,p), 1 )
+ 3053       CONTINUE
+            IF ( .NOT. ( LSVEC .OR. RSVEC ) ) THEN
+                CALL DPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $               WORK, IWORK(N+IWOFF), IERR )
+            ELSE
+                CALL DPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $               WORK(N+1), IWORK(N+IWOFF), IERR )
+            END IF
+            SCONDA = ONE / SQRT(RTMP)
+*           For NR=N, SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1),
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*           See the reference [1] for more details.
+         END IF
+*
+      ENDIF
+*
+      IF ( WNTUR ) THEN
+         N1 = NR
+      ELSE IF ( WNTUS .OR. WNTUF) THEN
+         N1 = N
+      ELSE IF ( WNTUA ) THEN
+         N1 = M
+      END IF
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*.......................................................................
+*        .. only the singular values are requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*           .. compute the singular values of R**T = [A](1:NR,1:N)**T
+*           .. set the lower triangle of [A] to [A](1:NR,1:N)**T and
+*              the upper triangle of [A] to zero.
+            DO 1146 p = 1, MIN( N, NR )
+               DO 1147 q = p + 1, N
+                  A(q,p) = A(p,q)
+                  IF ( q .LE. NR ) A(p,q) = ZERO
+ 1147          CONTINUE
+ 1146       CONTINUE
+*
+            CALL DGESVD( 'N', 'N', N, NR, A, LDA, S, U, LDU,
+     $           V, LDV, WORK, LWORK, INFO )
+*
+         ELSE
+*
+*           .. compute the singular values of R = [A](1:NR,1:N)
+*
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'L', NR-1,NR-1, ZERO,ZERO, A(2,1), LDA )
+            CALL DGESVD( 'N', 'N', NR, N, A, LDA, S, U, LDU,
+     $           V, LDV, WORK, LWORK, INFO )
+*
+         END IF
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC) ) THEN
+*.......................................................................
+*       .. the singular values and the left singular vectors requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*            .. apply DGESVD to R**T
+*            .. copy R**T into [U] and overwrite [U] with the right singular
+*            vectors of R
+            DO 1192 p = 1, NR
+               DO 1193 q = p, N
+                  U(q,p) = A(p,q)
+ 1193          CONTINUE
+ 1192       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, U(1,2), LDU )
+*           .. the left singular vectors not computed, the NR right singular
+*           vectors overwrite [U](1:NR,1:NR) as transposed. These
+*           will be pre-multiplied by Q to build the left singular vectors of A.
+            CALL DGESVD( 'N', 'O', N, NR, U, LDU, S, U, LDU,
+     $           U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+            DO 1119 p = 1, NR
+               DO 1120 q = p + 1, NR
+                  RTMP = U(q,p)
+                  U(q,p) = U(p,q)
+                  U(p,q) = RTMP
+ 1120          CONTINUE
+ 1119       CONTINUE
+*
+         ELSE
+*            .. apply DGESVD to R
+*            .. copy R into [U] and overwrite [U] with the left singular vectors
+            CALL DLACPY( 'U', NR, N, A, LDA, U, LDU )
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'L', NR-1, NR-1, ZERO, ZERO, U(2,1), LDU )
+*            .. the right singular vectors not computed, the NR left singular
+*            vectors overwrite [U](1:NR,1:NR)
+            CALL DGESVD( 'O', 'N', NR, N, U, LDU, S, U, LDU,
+     $           V, LDV, WORK(N+1), LWORK-N, INFO )
+*            .. now [U](1:NR,1:NR) contains the NR left singular vectors of
+*            R. These will be pre-multiplied by Q to build the left singular
+*            vectors of A.
+         END IF
+*
+*        .. assemble the left singular vector matrix U of dimensions
+*           (M x NR) or (M x N) or (M x M).
+         IF ( ( NR .LT. M ) .AND. ( .NOT.WNTUF ) ) THEN
+            CALL DLASET('A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU)
+            IF ( NR .LT. N1 ) THEN
+               CALL DLASET( 'A',NR,N1-NR,ZERO,ZERO,U(1,NR+1), LDU )
+               CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $              U(NR+1,NR+1), LDU )
+            END IF
+         END IF
+*
+*        The Q matrix from the first QRF is built into the left singular
+*        vectors matrix U.
+*
+         IF ( .NOT.WNTUF )
+     $      CALL DORMQR( 'L', 'N', M, N1, N, A, LDA, WORK, U,
+     $           LDU, WORK(N+1), LWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $      CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+      ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
+*.......................................................................
+*       .. the singular values and the right singular vectors requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*            .. apply DGESVD to R**T
+*            .. copy R**T into V and overwrite V with the left singular vectors
+            DO 1165 p = 1, NR
+               DO 1166 q = p, N
+                  V(q,p) = (A(p,q))
+ 1166          CONTINUE
+ 1165       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+*           .. the left singular vectors of R**T overwrite V, the right singular
+*           vectors not computed
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+               CALL DGESVD( 'O', 'N', N, NR, V, LDV, S, U, LDU,
+     $              U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+               DO 1121 p = 1, NR
+                  DO 1122 q = p + 1, NR
+                     RTMP = V(q,p)
+                     V(q,p) = V(p,q)
+                     V(p,q) = RTMP
+ 1122             CONTINUE
+ 1121          CONTINUE
+*
+               IF ( NR .LT. N ) THEN
+                  DO 1103 p = 1, NR
+                     DO 1104 q = NR + 1, N
+                        V(p,q) = V(q,p)
+ 1104                CONTINUE
+ 1103             CONTINUE
+               END IF
+               CALL DLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*            [!] This is a simple implementation that augments [V](1:N,1:NR)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the QR factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
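+*               [[Note: the DLASET call below pads [V](1:N,1:NR) with an
+*               N x (N-NR) zero block, so DGESVD is applied to a square
+*               matrix. The zero block contributes only zero singular
+*               values, hence the leading NR singular values and vectors
+*               are those of R**T, and the extra left singular vectors
+*               returned for the zero block merely complete an
+*               orthonormal basis.]]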
+                CALL DLASET('G', N, N-NR, ZERO, ZERO, V(1,NR+1), LDV)
+                CALL DGESVD( 'O', 'N', N, N, V, LDV, S, U, LDU,
+     $               U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+                DO 1123 p = 1, N
+                   DO 1124 q = p + 1, N
+                      RTMP = V(q,p)
+                      V(q,p) = V(p,q)
+                      V(p,q) = RTMP
+ 1124              CONTINUE
+ 1123           CONTINUE
+                CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK )
+            END IF
+*
+         ELSE
+*            .. apply DGESVD to R
+*            .. copy R into V and overwrite V with the right singular vectors
+            CALL DLACPY( 'U', NR, N, A, LDA, V, LDV )
+            IF ( NR .GT. 1 )
+     $         CALL DLASET( 'L', NR-1, NR-1, ZERO, ZERO, V(2,1), LDV )
+*            .. the right singular vectors overwrite V, the NR left singular
+*            vectors stored in U(1:NR,1:NR)
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+               CALL DGESVD( 'N', 'O', NR, N, V, LDV, S, U, LDU,
+     $              V, LDV, WORK(N+1), LWORK-N, INFO )
+               CALL DLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**T
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*            [!] This is a simple implementation that augments [V](1:NR,1:N)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the LQ factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+               CALL DLASET('G', N-NR, N, ZERO,ZERO, V(NR+1,1), LDV)
+               CALL DGESVD( 'N', 'O', N, N, V, LDV, S, U, LDU,
+     $              V, LDV, WORK(N+1), LWORK-N, INFO )
+               CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK )
+            END IF
+*            .. now [V] contains the transposed matrix of the right singular
+*            vectors of A.
+         END IF
+*
+      ELSE
+*.......................................................................
+*       .. FULL SVD requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*            .. apply DGESVD to R**T [[this option is left for R&D&T]]
+*
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*            .. copy R**T into [V] and overwrite [V] with the left singular
+*            vectors of R**T
+            DO 1168 p = 1, NR
+               DO 1169 q = p, N
+                  V(q,p) = A(p,q)
+ 1169          CONTINUE
+ 1168       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+*
+*           .. the left singular vectors of R**T overwrite [V], the NR right
+*           singular vectors of R**T stored in [U](1:NR,1:NR) as transposed
+               CALL DGESVD( 'O', 'A', N, NR, V, LDV, S, V, LDV,
+     $              U, LDU, WORK(N+1), LWORK-N, INFO )
+*              .. assemble V
+               DO 1115 p = 1, NR
+                  DO 1116 q = p + 1, NR
+                     RTMP = V(q,p)
+                     V(q,p) = V(p,q)
+                     V(p,q) = RTMP
+ 1116             CONTINUE
+ 1115          CONTINUE
+               IF ( NR .LT. N ) THEN
+                  DO 1101 p = 1, NR
+                     DO 1102 q = NR+1, N
+                        V(p,q) = V(q,p)
+ 1102                CONTINUE
+ 1101             CONTINUE
+               END IF
+               CALL DLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*
+               DO 1117 p = 1, NR
+                  DO 1118 q = p + 1, NR
+                     RTMP = U(q,p)
+                     U(q,p) = U(p,q)
+                     U(p,q) = RTMP
+ 1118             CONTINUE
+ 1117          CONTINUE
+*
+               IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL DLASET('A', M-NR,NR, ZERO,ZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*            .. copy R**T into [V] and overwrite [V] with the left singular
+*            vectors of R**T
+*               [[The optimal ratio N/NR for using QRF instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'DGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+               OPTRATIO = 2
+               IF ( OPTRATIO*NR .GT. N ) THEN
+                  DO 1198 p = 1, NR
+                     DO 1199 q = p, N
+                        V(q,p) = A(p,q)
+ 1199                CONTINUE
+ 1198             CONTINUE
+                  IF ( NR .GT. 
1 ) + $ CALL DLASET('U',NR-1,NR-1, ZERO,ZERO, V(1,2),LDV) +* + CALL DLASET('A',N,N-NR,ZERO,ZERO,V(1,NR+1),LDV) + CALL DGESVD( 'O', 'A', N, N, V, LDV, S, V, LDV, + $ U, LDU, WORK(N+1), LWORK-N, INFO ) +* + DO 1113 p = 1, N + DO 1114 q = p + 1, N + RTMP = V(q,p) + V(q,p) = V(p,q) + V(p,q) = RTMP + 1114 CONTINUE + 1113 CONTINUE + CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x N1), i.e. (M x N) or (M x M). +* + DO 1111 p = 1, N + DO 1112 q = p + 1, N + RTMP = U(q,p) + U(q,p) = U(p,q) + U(p,q) = RTMP + 1112 CONTINUE + 1111 CONTINUE +* + IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL DLASET('A',M-N,N,ZERO,ZERO,U(N+1,1),LDU) + IF ( N .LT. N1 ) THEN + CALL DLASET('A',N,N1-N,ZERO,ZERO,U(1,N+1),LDU) + CALL DLASET('A',M-N,N1-N,ZERO,ONE, + $ U(N+1,N+1), LDU ) + END IF + END IF + ELSE +* .. copy R**T into [U] and overwrite [U] with the right +* singular vectors of R + DO 1196 p = 1, NR + DO 1197 q = p, N + U(q,NR+p) = A(p,q) + 1197 CONTINUE + 1196 CONTINUE + IF ( NR .GT. 1 ) + $ CALL DLASET('U',NR-1,NR-1,ZERO,ZERO,U(1,NR+2),LDU) + CALL DGEQRF( N, NR, U(1,NR+1), LDU, WORK(N+1), + $ WORK(N+NR+1), LWORK-N-NR, IERR ) + DO 1143 p = 1, NR + DO 1144 q = 1, N + V(q,p) = U(p,NR+q) + 1144 CONTINUE + 1143 CONTINUE + CALL DLASET('U',NR-1,NR-1,ZERO,ZERO,V(1,2),LDV) + CALL DGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU, + $ V,LDV, WORK(N+NR+1),LWORK-N-NR, INFO ) + CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV) + CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV) + CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV) + CALL DORMQR('R','C', N, N, NR, U(1,NR+1), LDU, + $ WORK(N+1),V,LDV,WORK(N+NR+1),LWORK-N-NR,IERR) + CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL DLASET('A',M-NR,NR,ZERO,ZERO,U(NR+1,1),LDU) + IF ( NR .LT. N1 ) THEN + CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF + END IF + END IF +* + ELSE +* +* .. apply DGESVD to R [[this is the recommended option]] +* + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN +* .. copy R into [V] and overwrite V with the right singular vectors + CALL DLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL DLASET( 'L', NR-1,NR-1, ZERO,ZERO, V(2,1), LDV ) +* .. the right singular vectors of R overwrite [V], the NR left +* singular vectors of R stored in [U](1:NR,1:NR) + CALL DGESVD( 'S', 'O', NR, N, V, LDV, S, U, LDU, + $ V, LDV, WORK(N+1), LWORK-N, INFO ) + CALL DLAPMT( .FALSE., NR, N, V, LDV, IWORK ) +* .. now [V](1:NR,1:N) contains V(1:N,1:NR)**T +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL DLASET('A', M-NR,NR, ZERO,ZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF +* + ELSE +* .. need all N right singular vectors and NR < N +* .. the requested number of the left singular vectors +* is then N1 (N or M) +* [[The optimal ratio N/NR for using LQ instead of padding +* with zeros. Here hard coded to 2; it must be at least +* two due to work space constraints.]] +* OPTRATIO = ILAENV(6, 'DGESVD', 'S' // 'O', NR,N,0,0) +* OPTRATIO = MAX( OPTRATIO, 2 ) + OPTRATIO = 2 + IF ( OPTRATIO * NR .GT. 
N ) THEN + CALL DLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL DLASET('L', NR-1,NR-1, ZERO,ZERO, V(2,1),LDV) +* .. the right singular vectors of R overwrite [V], the NR left +* singular vectors of R stored in [U](1:NR,1:NR) + CALL DLASET('A', N-NR,N, ZERO,ZERO, V(NR+1,1),LDV) + CALL DGESVD( 'S', 'O', N, N, V, LDV, S, U, LDU, + $ V, LDV, WORK(N+1), LWORK-N, INFO ) + CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. now [V] contains the transposed matrix of the right +* singular vectors of A. The leading N left singular vectors +* are in [U](1:N,1:N) +* .. assemble the left singular vector matrix U of dimensions +* (M x N1), i.e. (M x N) or (M x M). + IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL DLASET('A',M-N,N,ZERO,ZERO,U(N+1,1),LDU) + IF ( N .LT. N1 ) THEN + CALL DLASET('A',N,N1-N,ZERO,ZERO,U(1,N+1),LDU) + CALL DLASET( 'A',M-N,N1-N,ZERO,ONE, + $ U(N+1,N+1), LDU ) + END IF + END IF + ELSE + CALL DLACPY( 'U', NR, N, A, LDA, U(NR+1,1), LDU ) + IF ( NR .GT. 1 ) + $ CALL DLASET('L',NR-1,NR-1,ZERO,ZERO,U(NR+2,1),LDU) + CALL DGELQF( NR, N, U(NR+1,1), LDU, WORK(N+1), + $ WORK(N+NR+1), LWORK-N-NR, IERR ) + CALL DLACPY('L',NR,NR,U(NR+1,1),LDU,V,LDV) + IF ( NR .GT. 1 ) + $ CALL DLASET('U',NR-1,NR-1,ZERO,ZERO,V(1,2),LDV) + CALL DGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU, + $ V, LDV, WORK(N+NR+1), LWORK-N-NR, INFO ) + CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV) + CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV) + CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV) + CALL DORMLQ('R','N',N,N,NR,U(NR+1,1),LDU,WORK(N+1), + $ V, LDV, WORK(N+NR+1),LWORK-N-NR,IERR) + CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL DLASET('A',M-NR,NR,ZERO,ZERO,U(NR+1,1),LDU) + IF ( NR .LT. N1 ) THEN + CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF + END IF + END IF +* .. end of the "R**T or R" branch + END IF +* +* The Q matrix from the first QRF is built into the left singular +* vectors matrix U. +* + IF ( .NOT. WNTUF ) + $ CALL DORMQR( 'L', 'N', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) + IF ( ROWPRM .AND. .NOT.WNTUF ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 ) +* +* ... end of the "full SVD" branch + END IF +* +* Check whether some singular values are returned as zeros, e.g. +* due to underflow, and update the numerical rank. + p = NR + DO 4001 q = p, 1, -1 + IF ( S(q) .GT. ZERO ) GO TO 4002 + NR = NR - 1 + 4001 CONTINUE + 4002 CONTINUE +* +* .. if numerical rank deficiency is detected, the truncated +* singular values are set to zero. + IF ( NR .LT. N ) CALL DLASET( 'G', N-NR,1, ZERO,ZERO, S(NR+1), N ) +* .. undo scaling; this may cause overflow in the largest singular +* values. + IF ( ASCALED ) + $ CALL DLASCL( 'G',0,0, ONE,SQRT(DBLE(M)), NR,1, S, N, IERR ) + IF ( CONDA ) RWORK(1) = SCONDA + RWORK(2) = p - NR +* .. p-NR is the number of singular values that are computed as +* exact zeros in DGESVD() applied to the (possibly truncated) +* full row rank triangular (trapezoidal) factor of A. 
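+*     [[A calling sketch, for illustration only and not part of this
+*     routine: one would first query the workspace sizes, e.g.
+*
+*        CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, A, LDA, S, U,
+*     $                LDU, V, LDV, NUMRANK, IWORK, -1, WORK, -1,
+*     $                RWORK, -1, INFO )
+*
+*     which returns the minimal LIWORK in IWORK(1), the optimal and
+*     minimal LWORK in WORK(1) and WORK(2), and the minimal LRWORK in
+*     RWORK(1), as set in the workspace-query branch above, and then
+*     repeat the call with the allocated workspace lengths.]]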
+ NUMRANK = NR +* + RETURN +* +* End of DGESVDQ +* + END diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f index 2cbc5ce0e..cf7aac982 100644 --- a/lapack-netlib/SRC/dgesvj.f +++ b/lapack-netlib/SRC/dgesvj.f @@ -90,13 +90,13 @@ *> JOBV is CHARACTER*1 *> Specifies whether to compute the right singular vectors, that *> is, the matrix V: -*> = 'V' : the matrix V is computed and returned in the array V -*> = 'A' : the Jacobi rotations are applied to the MV-by-N +*> = 'V': the matrix V is computed and returned in the array V +*> = 'A': the Jacobi rotations are applied to the MV-by-N *> array V. In other words, the right singular vector *> matrix V is not computed explicitly, instead it is *> applied to an MV-by-N matrix initially stored in the *> first MV rows of V. -*> = 'N' : the matrix V is not computed and the array V is not +*> = 'N': the matrix V is not computed and the array V is not *> referenced *> \endverbatim *> @@ -118,8 +118,8 @@ *> A is DOUBLE PRECISION array, dimension (LDA,N) *> On entry, the M-by-N matrix A. *> On exit : -*> If JOBU .EQ. 'U' .OR. JOBU .EQ. 'C' : -*> If INFO .EQ. 0 : +*> If JOBU = 'U' .OR. JOBU = 'C' : +*> If INFO = 0 : *> RANKA orthonormal columns of U are returned in the *> leading RANKA columns of the array A. Here RANKA <= N *> is the number of computed singular values of A that are @@ -129,9 +129,9 @@ *> in the array WORK as RANKA=NINT(WORK(2)). Also see the *> descriptions of SVA and WORK. The computed columns of U *> are mutually numerically orthogonal up to approximately -*> TOL=DSQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU.EQ.'C'), +*> TOL=DSQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU = 'C'), *> see the description of JOBU. -*> If INFO .GT. 0 : +*> If INFO > 0 : *> the procedure DGESVJ did not converge in the given number *> of iterations (sweeps). In that case, the computed *> columns of U may not be orthogonal up to TOL. The output @@ -140,8 +140,8 @@ *> input matrix A in the sense that the residual *> ||A-SCALE*U*SIGMA*V^T||_2 / ||A||_2 is small. *> -*> If JOBU .EQ. 'N' : -*> If INFO .EQ. 0 : +*> If JOBU = 'N' : +*> If INFO = 0 : *> Note that the left singular vectors are 'for free' in the *> one-sided Jacobi SVD algorithm. However, if only the *> singular values are needed, the level of numerical @@ -150,7 +150,7 @@ *> numerically orthogonal up to approximately M*EPS. Thus, *> on exit, A contains the columns of U scaled with the *> corresponding singular values. -*> If INFO .GT. 0 : +*> If INFO > 0 : *> the procedure DGESVJ did not converge in the given number *> of iterations (sweeps). *> \endverbatim @@ -165,9 +165,9 @@ *> \verbatim *> SVA is DOUBLE PRECISION array, dimension (N) *> On exit : -*> If INFO .EQ. 0 : +*> If INFO = 0 : *> depending on the value SCALE = WORK(1), we have: -*> If SCALE .EQ. ONE : +*> If SCALE = ONE : *> SVA(1:N) contains the computed singular values of A. *> During the computation SVA contains the Euclidean column *> norms of the iterated matrices in the array A. @@ -175,7 +175,7 @@ *> The singular values of A are SCALE*SVA(1:N), and this *> factored representation is due to the fact that some of the *> singular values of A might underflow or overflow. -*> If INFO .GT. 0 : +*> If INFO > 0 : *> the procedure DGESVJ did not converge in the given number of *> iterations (sweeps) and SCALE*SVA(1:N) may not be accurate. *> \endverbatim @@ -183,7 +183,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 
'A', then the product of Jacobi rotations in DGESVJ +*> If JOBV = 'A', then the product of Jacobi rotations in DGESVJ *> is applied to the first MV rows of V. See the description of JOBV. *> \endverbatim *> @@ -201,16 +201,16 @@ *> \param[in] LDV *> \verbatim *> LDV is INTEGER -*> The leading dimension of the array V, LDV .GE. 1. -*> If JOBV .EQ. 'V', then LDV .GE. max(1,N). -*> If JOBV .EQ. 'A', then LDV .GE. max(1,MV) . +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V', then LDV >= max(1,N). +*> If JOBV = 'A', then LDV >= max(1,MV) . *> \endverbatim *> *> \param[in,out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (LWORK) *> On entry : -*> If JOBU .EQ. 'C' : +*> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. *> The process stops if all columns of A are mutually *> orthogonal up to CTOL*EPS, EPS=DLAMCH('E'). @@ -230,7 +230,7 @@ *> WORK(5) = max_{i.NE.j} |COS(A(:,i),A(:,j))| in the last sweep. *> This is useful information in cases when DGESVJ did *> not converge, as it can be used to estimate whether -*> the output is stil useful and for post festum analysis. +*> the output is still useful and for post festum analysis. *> WORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. @@ -245,9 +245,9 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value -*> > 0 : DGESVJ did not converge in the maximal allowed number (30) +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value +*> > 0: DGESVJ did not converge in the maximal allowed number (30) *> of sweeps. The output may still be useful. See the *> description of WORK. *> \endverbatim diff --git a/lapack-netlib/SRC/dgesvxx.f b/lapack-netlib/SRC/dgesvxx.f index afcd05d8e..21b56f61c 100644 --- a/lapack-netlib/SRC/dgesvxx.f +++ b/lapack-netlib/SRC/dgesvxx.f @@ -411,7 +411,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -447,14 +447,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension (NPARAMS) -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -462,9 +462,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. 
+*> = 1.0: Use the extra-precise refinement algorithm. *> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/dgetc2.f b/lapack-netlib/SRC/dgetc2.f index 0896a7013..5bf5b890f 100644 --- a/lapack-netlib/SRC/dgetc2.f +++ b/lapack-netlib/SRC/dgetc2.f @@ -85,7 +85,7 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> > 0: if INFO = k, U(k, k) is likely to produce owerflow if +*> > 0: if INFO = k, U(k, k) is likely to produce overflow if *> we try to solve for x in Ax = b. So U is perturbed to *> avoid the overflow. *> \endverbatim diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index 3b44a40ab..dfc72c8b2 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -1,3 +1,5 @@ +*> \brief \b DGETSLS +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/dggesx.f b/lapack-netlib/SRC/dggesx.f index 47022fbdf..0e57d636e 100644 --- a/lapack-netlib/SRC/dggesx.f +++ b/lapack-netlib/SRC/dggesx.f @@ -131,10 +131,10 @@ *> \verbatim *> SENSE is CHARACTER*1 *> Determines which reciprocal condition numbers are computed. -*> = 'N' : None are computed; -*> = 'E' : Computed for average of selected eigenvalues only; -*> = 'V' : Computed for selected deflating subspaces only; -*> = 'B' : Computed for both. +*> = 'N': None are computed; +*> = 'E': Computed for average of selected eigenvalues only; +*> = 'V': Computed for selected deflating subspaces only; +*> = 'B': Computed for both. *> If SENSE = 'E', 'V', or 'B', SORT must equal 'S'. *> \endverbatim *> diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f index 4fd38d37e..318e369fd 100644 --- a/lapack-netlib/SRC/dgsvj0.f +++ b/lapack-netlib/SRC/dgsvj0.f @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is DOUBLE PRECISION array, dimension (LDV,N) -*> If JOBV .EQ. 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multipled by a *> sequence of Jacobi rotations. -*> If JOBV .EQ. 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim @@ -136,8 +136,8 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', LDV .GE. N. -*> If JOBV = 'A', LDV .GE. MV. +*> If JOBV = 'V', LDV >= N. +*> If JOBV = 'A', LDV >= MV. *> \endverbatim *> *> \param[in] EPS @@ -157,7 +157,7 @@ *> TOL is DOUBLE PRECISION *> TOL is the threshold for Jacobi rotations. For a pair *> A(:,p), A(:,q) of pivot columns, the Jacobi rotation is -*> applied only if DABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL. +*> applied only if DABS(COS(angle(A(:,p),A(:,q)))) > TOL. *> \endverbatim *> *> \param[in] NSWEEP @@ -175,14 +175,14 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> LWORK is the dimension of WORK. LWORK .GE. M. +*> LWORK is the dimension of WORK. LWORK >= M. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value +*> = 0: successful exit. 
+*> < 0: if INFO = -i, then the i-th argument had an illegal value *> \endverbatim * * Authors: @@ -1045,7 +1045,7 @@ 1993 CONTINUE * end i=1:NSWEEP loop -* #:) Reaching this point means that the procedure has comleted the given +* #:) Reaching this point means that the procedure has completed the given * number of iterations. INFO = NSWEEP - 1 GO TO 1995 diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f index 376682c7f..35a93619f 100644 --- a/lapack-netlib/SRC/dgsvj1.f +++ b/lapack-netlib/SRC/dgsvj1.f @@ -61,7 +61,7 @@ *> In terms of the columns of A, the first N1 columns are rotated 'against' *> the remaining N-N1 columns, trying to increase the angle between the *> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is -*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parmeter. +*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parameter. *> The number of sweeps is given in NSWEEP and the orthogonality threshold *> is given in TOL. *> \endverbatim @@ -147,27 +147,27 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then MV rows of V are post-multipled by a -*> sequence of Jacobi rotations. -*> If JOBV = 'N', then MV is not referenced. +*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> sequence of Jacobi rotations. +*> If JOBV = 'N', then MV is not referenced. *> \endverbatim *> *> \param[in,out] V *> \verbatim *> V is DOUBLE PRECISION array, dimension (LDV,N) -*> If JOBV .EQ. 'V' then N rows of V are post-multipled by a -*> sequence of Jacobi rotations. -*> If JOBV .EQ. 'A' then MV rows of V are post-multipled by a -*> sequence of Jacobi rotations. -*> If JOBV = 'N', then V is not referenced. +*> If JOBV = 'V', then N rows of V are post-multipled by a +*> sequence of Jacobi rotations. +*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> sequence of Jacobi rotations. +*> If JOBV = 'N', then V is not referenced. *> \endverbatim *> *> \param[in] LDV *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', LDV .GE. N. -*> If JOBV = 'A', LDV .GE. MV. +*> If JOBV = 'V', LDV >= N. +*> If JOBV = 'A', LDV >= MV. *> \endverbatim *> *> \param[in] EPS @@ -187,7 +187,7 @@ *> TOL is DOUBLE PRECISION *> TOL is the threshold for Jacobi rotations. For a pair *> A(:,p), A(:,q) of pivot columns, the Jacobi rotation is -*> applied only if DABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL. +*> applied only if DABS(COS(angle(A(:,p),A(:,q)))) > TOL. *> \endverbatim *> *> \param[in] NSWEEP @@ -205,14 +205,14 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> LWORK is the dimension of WORK. LWORK .GE. M. +*> LWORK is the dimension of WORK. LWORK >= M. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dhseqr.f b/lapack-netlib/SRC/dhseqr.f index 4444b955f..b4fc3af90 100644 --- a/lapack-netlib/SRC/dhseqr.f +++ b/lapack-netlib/SRC/dhseqr.f @@ -70,7 +70,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -87,7 +87,7 @@ *> set by a previous call to DGEBAL, and then passed to ZGEHRD *> when the matrix output by DGEBAL is reduced to Hessenberg *> form. 
Otherwise ILO and IHI should be set to 1 and N -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -100,20 +100,20 @@ *> (the Schur form); 2-by-2 diagonal blocks (corresponding to *> complex conjugate pairs of eigenvalues) are returned in *> standard form, with H(i,i) = H(i+1,i+1) and -*> H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the +*> H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and JOB = 'E', the *> contents of H are unspecified on exit. (The output value of -*> H when INFO.GT.0 is given under the description of INFO +*> H when INFO > 0 is given under the description of INFO *> below.) *> *> Unlike earlier versions of DHSEQR, this subroutine may -*> explicitly H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1 +*> explicitly H(i,j) = 0 for i > j and j = 1, 2, ... ILO-1 *> or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] WR @@ -128,8 +128,8 @@ *> The real and imaginary parts, respectively, of the computed *> eigenvalues. If two eigenvalues are computed as a complex *> conjugate pair, they are stored in consecutive elements of -*> WR and WI, say the i-th and (i+1)th, with WI(i) .GT. 0 and -*> WI(i+1) .LT. 0. If JOB = 'S', the eigenvalues are stored in +*> WR and WI, say the i-th and (i+1)th, with WI(i) > 0 and +*> WI(i+1) < 0. If JOB = 'S', the eigenvalues are stored in *> the same order as on the diagonal of the Schur form returned *> in H, with WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 *> diagonal block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and @@ -148,7 +148,7 @@ *> if INFO = 0, Z contains Q*Z. *> Normally Q is the orthogonal matrix generated by DORGHR *> after the call to DGEHRD which formed the Hessenberg matrix -*> H. (The output value of Z when INFO.GT.0 is given under +*> H. (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -156,7 +156,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if COMPZ = 'I' or -*> COMPZ = 'V', then LDZ.GE.MAX(1,N). Otherwize, LDZ.GE.1. +*> COMPZ = 'V', then LDZ >= MAX(1,N). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -169,7 +169,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient and delivers very good and sometimes *> optimal performance. However, LWORK as large as 11*N *> may be required for optimal performance. A workspace @@ -187,21 +187,21 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .LT. 0: if INFO = -i, the i-th argument had an illegal +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal *> value -*> .GT. 0: if INFO = i, DHSEQR failed to compute all of +*> > 0: if INFO = i, DHSEQR failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and JOB = 'E', then on exit, the +*> If INFO > 0 and JOB = 'E', then on exit, the *> remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. 
*> -*> If INFO .GT. 0 and JOB = 'S', then on exit +*> If INFO > 0 and JOB = 'S', then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -209,19 +209,19 @@ *> value of H is upper Hessenberg and quasi-triangular *> in rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and COMPZ = 'V', then on exit +*> If INFO > 0 and COMPZ = 'V', then on exit *> *> (final value of Z) = (initial value of Z)*U *> *> where U is the orthogonal matrix in (*) (regard- *> less of the value of JOB.) *> -*> If INFO .GT. 0 and COMPZ = 'I', then on exit +*> If INFO > 0 and COMPZ = 'I', then on exit *> (final value of Z) = U *> where U is the orthogonal matrix in (*) (regard- *> less of the value of JOB.) *> -*> If INFO .GT. 0 and COMPZ = 'N', then Z is not +*> If INFO > 0 and COMPZ = 'N', then Z is not *> accessed. *> \endverbatim * @@ -261,8 +261,8 @@ *> This depends on ILO, IHI and NS. NS is the *> number of simultaneous shifts returned *> by ILAENV(ISPEC=15). (See ISPEC=15 below.) -*> The default for (IHI-ILO+1).LE.500 is NS. -*> The default for (IHI-ILO+1).GT.500 is 3*NS/2. +*> The default for (IHI-ILO+1) <= 500 is NS. +*> The default for (IHI-ILO+1) > 500 is 3*NS/2. *> *> ISPEC=14: Nibble crossover point. (See IPARMQ for *> details.) Default: 14% of deflation window @@ -341,8 +341,8 @@ PARAMETER ( NTINY = 11 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare DLAHQR failure. NL .GT. NTINY = 11 is -* . required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom- +* . through a rare DLAHQR failure. NL > NTINY = 11 is +* . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 * . deflation window. ==== diff --git a/lapack-netlib/SRC/dla_gbrcond.f b/lapack-netlib/SRC/dla_gbrcond.f index e9713c9ca..c9eebcbea 100644 --- a/lapack-netlib/SRC/dla_gbrcond.f +++ b/lapack-netlib/SRC/dla_gbrcond.f @@ -141,13 +141,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (5*N). *> Workspace. *> \endverbatim *> -*> \param[in] IWORK +*> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/dla_gbrfsx_extended.f b/lapack-netlib/SRC/dla_gbrfsx_extended.f index 12b2a32a4..282a63f1c 100644 --- a/lapack-netlib/SRC/dla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/dla_gbrfsx_extended.f @@ -66,19 +66,19 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] TRANS_TYPE *> \verbatim *> TRANS_TYPE is INTEGER *> Specifies the transposition operation on A. -*> The value is defined by ILATRANS(T) where T is a CHARACTER and -*> T = 'N': No transpose +*> The value is defined by ILATRANS(T) where T is a CHARACTER and T +*> = 'N': No transpose *> = 'T': Transpose *> = 'C': Conjugate transpose *> \endverbatim @@ -270,7 +270,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. 
If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/dla_gercond.f b/lapack-netlib/SRC/dla_gercond.f index aa93ca5a4..6f7d70a6a 100644 --- a/lapack-netlib/SRC/dla_gercond.f +++ b/lapack-netlib/SRC/dla_gercond.f @@ -123,13 +123,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (3*N). *> Workspace. *> \endverbatim *> -*> \param[in] IWORK +*> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/dla_gerfsx_extended.f b/lapack-netlib/SRC/dla_gerfsx_extended.f index 082f810f0..4cb9ef4f9 100644 --- a/lapack-netlib/SRC/dla_gerfsx_extended.f +++ b/lapack-netlib/SRC/dla_gerfsx_extended.f @@ -64,19 +64,19 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] TRANS_TYPE *> \verbatim *> TRANS_TYPE is INTEGER *> Specifies the transposition operation on A. -*> The value is defined by ILATRANS(T) where T is a CHARACTER and -*> T = 'N': No transpose +*> The value is defined by ILATRANS(T) where T is a CHARACTER and T +*> = 'N': No transpose *> = 'T': Transpose *> = 'C': Conjugate transpose *> \endverbatim @@ -256,7 +256,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERRS_C is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERRS_C is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERRS_C(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/dla_porcond.f b/lapack-netlib/SRC/dla_porcond.f index 498e707e3..b2f9c4b1e 100644 --- a/lapack-netlib/SRC/dla_porcond.f +++ b/lapack-netlib/SRC/dla_porcond.f @@ -113,13 +113,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (3*N). *> Workspace. *> \endverbatim *> -*> \param[in] IWORK +*> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/dla_porfsx_extended.f b/lapack-netlib/SRC/dla_porfsx_extended.f index 8c0d6bebd..ece9b00ed 100644 --- a/lapack-netlib/SRC/dla_porfsx_extended.f +++ b/lapack-netlib/SRC/dla_porfsx_extended.f @@ -65,11 +65,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -246,7 +246,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. 
If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/dla_porpvgrw.f b/lapack-netlib/SRC/dla_porpvgrw.f index 4fe1a1922..8a6f9e1a7 100644 --- a/lapack-netlib/SRC/dla_porpvgrw.f +++ b/lapack-netlib/SRC/dla_porpvgrw.f @@ -85,7 +85,7 @@ *> The leading dimension of the array AF. LDAF >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (2*N) *> \endverbatim diff --git a/lapack-netlib/SRC/dla_syrcond.f b/lapack-netlib/SRC/dla_syrcond.f index 91d557145..23ed82588 100644 --- a/lapack-netlib/SRC/dla_syrcond.f +++ b/lapack-netlib/SRC/dla_syrcond.f @@ -119,13 +119,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (3*N). *> Workspace. *> \endverbatim *> -*> \param[in] IWORK +*> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/dla_syrfsx_extended.f b/lapack-netlib/SRC/dla_syrfsx_extended.f index f54d15194..b390600c0 100644 --- a/lapack-netlib/SRC/dla_syrfsx_extended.f +++ b/lapack-netlib/SRC/dla_syrfsx_extended.f @@ -67,11 +67,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -255,7 +255,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/dla_syrpvgrw.f b/lapack-netlib/SRC/dla_syrpvgrw.f index c2e5cb018..5ba03093b 100644 --- a/lapack-netlib/SRC/dla_syrpvgrw.f +++ b/lapack-netlib/SRC/dla_syrpvgrw.f @@ -101,7 +101,7 @@ *> as determined by DSYTRF. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (2*N) *> \endverbatim diff --git a/lapack-netlib/SRC/dla_wwaddw.f b/lapack-netlib/SRC/dla_wwaddw.f index 99a86c553..4f50540d6 100644 --- a/lapack-netlib/SRC/dla_wwaddw.f +++ b/lapack-netlib/SRC/dla_wwaddw.f @@ -36,7 +36,7 @@ *> DLA_WWADDW adds a vector W into a doubled-single vector (X, Y). *> *> This works for all extant IBM's hex and binary floating point -*> arithmetics, but not for decimal. +*> arithmetic, but not for decimal. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dlaed4.f b/lapack-netlib/SRC/dlaed4.f index e7dc839df..033438d73 100644 --- a/lapack-netlib/SRC/dlaed4.f +++ b/lapack-netlib/SRC/dlaed4.f @@ -82,7 +82,7 @@ *> \param[out] DELTA *> \verbatim *> DELTA is DOUBLE PRECISION array, dimension (N) -*> If N .GT. 
2, DELTA contains (D(j) - lambda_I) in its j-th +*> If N > 2, DELTA contains (D(j) - lambda_I) in its j-th *> component. If N = 1, then DELTA(1) = 1. If N = 2, see DLAED5 *> for detail. The vector DELTA contains the information necessary *> to construct the eigenvectors by DLAED3 and DLAED9. diff --git a/lapack-netlib/SRC/dlaed8.f b/lapack-netlib/SRC/dlaed8.f index c053347b1..f64679dc0 100644 --- a/lapack-netlib/SRC/dlaed8.f +++ b/lapack-netlib/SRC/dlaed8.f @@ -353,7 +353,7 @@ Z( I ) = W( INDX( I ) ) 40 CONTINUE * -* Calculate the allowable deflation tolerence +* Calculate the allowable deflation tolerance * IMAX = IDAMAX( N, Z, 1 ) JMAX = IDAMAX( N, D, 1 ) diff --git a/lapack-netlib/SRC/dlagtf.f b/lapack-netlib/SRC/dlagtf.f index 4b257c64f..b92c84f39 100644 --- a/lapack-netlib/SRC/dlagtf.f +++ b/lapack-netlib/SRC/dlagtf.f @@ -125,7 +125,7 @@ *> then IN(k) = 1, otherwise IN(k) = 0. The element IN(n) *> returns the smallest positive integer j such that *> -*> abs( u(j,j) ).le. norm( (T - lambda*I)(j) )*TOL, +*> abs( u(j,j) ) <= norm( (T - lambda*I)(j) )*TOL, *> *> where norm( A(j) ) denotes the sum of the absolute values of *> the jth row of the matrix A. If no such j exists then IN(n) @@ -137,8 +137,8 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit -*> .lt. 0: if INFO = -k, the kth argument had an illegal value +*> = 0: successful exit +*> < 0: if INFO = -k, the kth argument had an illegal value *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlagts.f b/lapack-netlib/SRC/dlagts.f index 926075827..cbd35ae14 100644 --- a/lapack-netlib/SRC/dlagts.f +++ b/lapack-netlib/SRC/dlagts.f @@ -122,12 +122,12 @@ *> \param[in,out] TOL *> \verbatim *> TOL is DOUBLE PRECISION -*> On entry, with JOB .lt. 0, TOL should be the minimum +*> On entry, with JOB < 0, TOL should be the minimum *> perturbation to be made to very small diagonal elements of U. *> TOL should normally be chosen as about eps*norm(U), where eps *> is the relative machine precision, but if TOL is supplied as *> non-positive, then it is reset to eps*max( abs( u(i,j) ) ). -*> If JOB .gt. 0 then TOL is not referenced. +*> If JOB > 0 then TOL is not referenced. *> *> On exit, TOL is changed as described above, only if TOL is *> non-positive on entry. Otherwise TOL is unchanged. @@ -136,14 +136,14 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit -*> .lt. 0: if INFO = -i, the i-th argument had an illegal value -*> .gt. 0: overflow would occur when computing the INFO(th) -*> element of the solution vector x. This can only occur -*> when JOB is supplied as positive and either means -*> that a diagonal element of U is very small, or that -*> the elements of the right-hand side vector y are very -*> large. +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: overflow would occur when computing the INFO(th) +*> element of the solution vector x. This can only occur +*> when JOB is supplied as positive and either means +*> that a diagonal element of U is very small, or that +*> the elements of the right-hand side vector y are very +*> large. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlahqr.f b/lapack-netlib/SRC/dlahqr.f index f7365d21e..e863829ec 100644 --- a/lapack-netlib/SRC/dlahqr.f +++ b/lapack-netlib/SRC/dlahqr.f @@ -150,26 +150,26 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 
0: If INFO = i, DLAHQR failed to compute all the +*> = 0: successful exit +*> > 0: If INFO = i, DLAHQR failed to compute all the *> eigenvalues ILO to IHI in a total of 30 iterations *> per eigenvalue; elements i+1:ihi of WR and WI *> contain those eigenvalues which have been *> successfully computed. *> -*> If INFO .GT. 0 and WANTT is .FALSE., then on exit, +*> If INFO > 0 and WANTT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the *> eigenvalues of the upper Hessenberg matrix rows -*> and columns ILO thorugh INFO of the final, output +*> and columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> (*) (initial value of H)*U = U*(final value of H) -*> where U is an orthognal matrix. The final +*> where U is an orthogonal matrix. The final *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> (final value of Z) = (initial value of Z)*U *> where U is the orthogonal matrix in (*) *> (regardless of the value of WANTT.) diff --git a/lapack-netlib/SRC/dlaln2.f b/lapack-netlib/SRC/dlaln2.f index a094b737b..0c94ea308 100644 --- a/lapack-netlib/SRC/dlaln2.f +++ b/lapack-netlib/SRC/dlaln2.f @@ -49,7 +49,7 @@ *> the first column of each being the real part and the second *> being the imaginary part. *> -*> "s" is a scaling factor (.LE. 1), computed by DLALN2, which is +*> "s" is a scaling factor (<= 1), computed by DLALN2, which is *> so chosen that X can be computed without overflow. X is further *> scaled if necessary to assure that norm(ca A - w D)*norm(X) is less *> than overflow. diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f index 19e32f888..306c3d3de 100644 --- a/lapack-netlib/SRC/dlamswlq.f +++ b/lapack-netlib/SRC/dlamswlq.f @@ -1,3 +1,4 @@ +*> \brief \b DLAMSWLQ * * Definition: * =========== diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f index 6af89d28e..41a067780 100644 --- a/lapack-netlib/SRC/dlamtsqr.f +++ b/lapack-netlib/SRC/dlamtsqr.f @@ -1,3 +1,4 @@ +*> \brief \b DLAMTSQR * * Definition: * =========== diff --git a/lapack-netlib/SRC/dlangb.f b/lapack-netlib/SRC/dlangb.f index 078573b87..0c4f938f7 100644 --- a/lapack-netlib/SRC/dlangb.f +++ b/lapack-netlib/SRC/dlangb.f @@ -129,6 +129,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER KL, KU, LDAB, N @@ -139,22 +140,24 @@ * * ===================================================================== * -* * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER I, J, K, L - DOUBLE PRECISION SCALE, SUM, VALUE, TEMP + DOUBLE PRECISION SUM, VALUE, TEMP * .. -* .. External Subroutines .. - EXTERNAL DLASSQ +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. +* .. External Subroutines .. + EXTERNAL DLASSQ, DCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT * .. @@ -206,15 +209,22 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
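+*              The pair (SSQ(1),SSQ(2)) encodes the running value
+*              SSQ(1)**2 * SSQ(2). DCOMBSSQ folds each column pair
+*              COLSSQ into SSQ; when SSQ(1) >= COLSSQ(1) this amounts to
+*                 SSQ(2) = SSQ(2) + (COLSSQ(1)/SSQ(1))**2 * COLSSQ(2)
+*              so that columns of very different magnitude can be
+*              accumulated without overflow.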
* - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N L = MAX( 1, J-KU ) K = KU + 1 - J + L - CALL DLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANGB = VALUE diff --git a/lapack-netlib/SRC/dlange.f b/lapack-netlib/SRC/dlange.f index 9dbf45e81..6b32fbefd 100644 --- a/lapack-netlib/SRC/dlange.f +++ b/lapack-netlib/SRC/dlange.f @@ -119,6 +119,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER LDA, M, N @@ -135,10 +136,13 @@ * .. * .. Local Scalars .. INTEGER I, J - DOUBLE PRECISION SCALE, SUM, VALUE, TEMP + DOUBLE PRECISION SUM, VALUE, TEMP +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Subroutines .. - EXTERNAL DLASSQ + EXTERNAL DLASSQ, DCOMBSSQ * .. * .. External Functions .. LOGICAL LSAME, DISNAN @@ -194,13 +198,19 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N - CALL DLASSQ( M, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( M, A( 1, J ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANGE = VALUE diff --git a/lapack-netlib/SRC/dlanhs.f b/lapack-netlib/SRC/dlanhs.f index 691dbc21e..a859d2216 100644 --- a/lapack-netlib/SRC/dlanhs.f +++ b/lapack-netlib/SRC/dlanhs.f @@ -113,6 +113,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER LDA, N @@ -129,15 +130,18 @@ * .. * .. Local Scalars .. INTEGER I, J - DOUBLE PRECISION SCALE, SUM, VALUE + DOUBLE PRECISION SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL DLASSQ +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. +* .. External Subroutines .. + EXTERNAL DLASSQ, DCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT * .. @@ -188,13 +192,20 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N - CALL DLASSQ( MIN( N, J+1 ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( MIN( N, J+1 ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANHS = VALUE diff --git a/lapack-netlib/SRC/dlansb.f b/lapack-netlib/SRC/dlansb.f index 4ccf5f27e..a82dc41b1 100644 --- a/lapack-netlib/SRC/dlansb.f +++ b/lapack-netlib/SRC/dlansb.f @@ -134,6 +134,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER K, LDAB, N @@ -150,15 +151,18 @@ * .. * .. Local Scalars .. 
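DCOMBSSQ, called after every per-column DLASSQ pass in these hunks, is a new auxiliary that the rewritten routines declare EXTERNAL alongside DLASSQ; it merges one (scale, sumsq) pair into another without ever leaving the scaled representation. A C sketch of the combining rule, assuming the netlib 3.9.0 semantics of DCOMBSSQ (v1 := v1 + v2 in the represented value):

    /* Merge two scaled sums of squares: each pair v[0]=scale, v[1]=sumsq
       represents v[0]*v[0]*v[1]; afterwards v1 represents the total. */
    static void combssq(double v1[2], const double v2[2])
    {
        if (v1[0] >= v2[0]) {
            if (v1[0] != 0.0) {
                double r = v2[0] / v1[0];
                v1[1] += r * r * v2[1];     /* rescale v2 into v1's scale */
            }
            /* both scales zero: both pairs represent 0, nothing to add */
        } else {
            double r = v1[0] / v2[0];
            v1[1] = v2[1] + r * r * v1[1];  /* rescale v1 into v2's scale */
            v1[0] = v2[0];
        }
    }

Accumulating each column into a fresh COLSSQ pair and combining once per column is where the "better accuracy" noted in the comments comes from: every partial sum is built at a scale suited to its own column before a single rescaling merge.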
INTEGER I, J, L - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL DLASSQ +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. +* .. External Subroutines .. + EXTERNAL DLASSQ, DCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT * .. @@ -225,29 +229,47 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( K.GT.0 ) THEN IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL DLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE L = K + 1 ELSE DO 120 J = 1, N - 1 - CALL DLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE L = 1 END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) ELSE L = 1 END IF - CALL DLASSQ( N, AB( L, 1 ), LDAB, SCALE, SUM ) - VALUE = SCALE*SQRT( SUM ) +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( N, AB( L, 1 ), LDAB, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANSB = VALUE diff --git a/lapack-netlib/SRC/dlansp.f b/lapack-netlib/SRC/dlansp.f index a1829db75..b6ad1ffcf 100644 --- a/lapack-netlib/SRC/dlansp.f +++ b/lapack-netlib/SRC/dlansp.f @@ -119,6 +119,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER N @@ -135,15 +136,18 @@ * .. * .. Local Scalars .. INTEGER I, J, K - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL DLASSQ +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. +* .. External Subroutines .. + EXTERNAL DLASSQ, DCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT * .. @@ -217,31 +221,48 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
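In the symmetric formats (DLANSB, DLANSP, DLANSY) only one triangle is stored, so after the strictly triangular part is accumulated the hunks double the sum-of-squares component, SSQ(2) = 2*SSQ(2), and only then fold in the diagonal; doubling sumsq doubles the represented value scale^2*sumsq without touching the scale. In plain doubles the same arithmetic looks like this (illustrative only):

    #include <math.h>
    #include <stdio.h>

    /* Frobenius norm of a symmetric 3x3 matrix from its upper triangle:
       ||A||_F^2 = 2 * sum_{i<j} a(i,j)^2 + sum_i a(i,i)^2 */
    int main(void)
    {
        double a[3][3] = { {2, 1, 4}, {1, 3, 5}, {4, 5, 6} };
        double off = 0.0, diag = 0.0;
        for (int j = 1; j < 3; j++)
            for (int i = 0; i < j; i++)      /* strict upper triangle */
                off += a[i][j] * a[i][j];
        for (int i = 0; i < 3; i++)
            diag += a[i][i] * a[i][i];
        printf("%.17g\n", sqrt(2.0 * off + diag));
        /* identical to summing all nine entries of the full matrix */
        return 0;
    }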
+* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE K = 2 IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL DLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + J 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL DLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 120 CONTINUE END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* K = 1 + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE DO 130 I = 1, N IF( AP( K ).NE.ZERO ) THEN ABSA = ABS( AP( K ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF IF( LSAME( UPLO, 'U' ) ) THEN @@ -250,7 +271,8 @@ K = K + N - I + 1 END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + CALL DCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANSP = VALUE diff --git a/lapack-netlib/SRC/dlansy.f b/lapack-netlib/SRC/dlansy.f index 2372fce0a..87d514c11 100644 --- a/lapack-netlib/SRC/dlansy.f +++ b/lapack-netlib/SRC/dlansy.f @@ -127,6 +127,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER LDA, N @@ -143,15 +144,18 @@ * .. * .. Local Scalars .. INTEGER I, J - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL DLASSQ +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. +* .. External Subroutines .. + EXTERNAL DLASSQ, DCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT * .. @@ -216,21 +220,39 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL DLASSQ( J-1, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( J-1, A( 1, J ), 1, COLSSQ(1), COLSSQ(2) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL DLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( N-J, A( J+1, J ), 1, COLSSQ(1), COLSSQ(2) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE END IF - SUM = 2*SUM - CALL DLASSQ( N, A, LDA+1, SCALE, SUM ) - VALUE = SCALE*SQRT( SUM ) + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( N, A, LDA+1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANSY = VALUE diff --git a/lapack-netlib/SRC/dlantb.f b/lapack-netlib/SRC/dlantb.f index 3d2bfe7e4..0d46f6cc8 100644 --- a/lapack-netlib/SRC/dlantb.f +++ b/lapack-netlib/SRC/dlantb.f @@ -145,6 +145,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER K, LDAB, N @@ -162,15 +163,18 @@ * .. 
Local Scalars .. LOGICAL UDIAG INTEGER I, J, L - DOUBLE PRECISION SCALE, SUM, VALUE + DOUBLE PRECISION SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL DLASSQ +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. +* .. External Subroutines .. + EXTERNAL DLASSQ, DCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT * .. @@ -311,46 +315,61 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N IF( K.GT.0 ) THEN DO 280 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL DLASSQ( MIN( J-1, K ), - $ AB( MAX( K+2-J, 1 ), J ), 1, SCALE, - $ SUM ) + $ AB( MAX( K+2-J, 1 ), J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 280 CONTINUE END IF ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 290 J = 1, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL DLASSQ( MIN( J, K+1 ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 290 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N IF( K.GT.0 ) THEN DO 300 J = 1, N - 1 - CALL DLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 300 CONTINUE END IF ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 310 J = 1, N - CALL DLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 310 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANTB = VALUE diff --git a/lapack-netlib/SRC/dlantp.f b/lapack-netlib/SRC/dlantp.f index f84a9e9d7..a7b89dec7 100644 --- a/lapack-netlib/SRC/dlantp.f +++ b/lapack-netlib/SRC/dlantp.f @@ -129,6 +129,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER N @@ -146,15 +147,18 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J, K - DOUBLE PRECISION SCALE, SUM, VALUE + DOUBLE PRECISION SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL DLASSQ +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. +* .. External Subroutines .. + EXTERNAL DLASSQ, DCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT * .. @@ -306,45 +310,64 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
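In the triangular routines the DIAG = 'U' branches seed the pair as SSQ(1) = ONE and SSQ(2) = N (MIN(M,N) in dlantr.f) rather than ZERO/ONE: the unit diagonal is implicit in the storage, and a scale of one with a sum-of-squares of n represents exactly the n squared ones it contributes. A two-line check in C:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        int n = 5;
        double ssq[2] = { 1.0, (double)n }; /* seed for an implicit unit diagonal */
        printf("%g\n", ssq[0] * sqrt(ssq[1]));   /* sqrt(5): norm of n ones */
        return 0;
    }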
* IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N K = 2 DO 280 J = 2, N - CALL DLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( J-1, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + J 280 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE K = 1 DO 290 J = 1, N - CALL DLASSQ( J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( J, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + J 290 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N K = 2 DO 300 J = 1, N - 1 - CALL DLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( N-J, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 300 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE K = 1 DO 310 J = 1, N - CALL DLASSQ( N-J+1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( N-J+1, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 310 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANTP = VALUE diff --git a/lapack-netlib/SRC/dlantr.f b/lapack-netlib/SRC/dlantr.f index 8585b2f68..adc7da4c4 100644 --- a/lapack-netlib/SRC/dlantr.f +++ b/lapack-netlib/SRC/dlantr.f @@ -146,6 +146,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER LDA, M, N @@ -163,15 +164,18 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J - DOUBLE PRECISION SCALE, SUM, VALUE + DOUBLE PRECISION SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL DLASSQ +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. +* .. External Subroutines .. + EXTERNAL DLASSQ, DCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT * .. @@ -281,7 +285,7 @@ END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - DO 210 I = 1, N + DO 210 I = 1, MIN( M, N ) WORK( I ) = ONE 210 CONTINUE DO 220 I = N + 1, M @@ -311,38 +315,56 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
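Besides the norm rewrite, the dlantr.f hunk above fixes a genuine bound: with DIAG = 'U' the infinity-norm branch filled WORK(1:N) with ones, which for M < N runs past the M entries of WORK, so the patch stops at MIN(M,N). The same pattern in C, with a hypothetical helper name:

    /* Initialise the row sums for an implicit unit diagonal.  'work' has
       m entries; the old loop bound n overruns it whenever m < n. */
    static void seed_unit_rows(double *work, int m, int n)
    {
        int limit = m < n ? m : n;      /* was: n */
        for (int i = 0; i < limit; i++)
            work[i] = 1.0;
    }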
* IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = MIN( M, N ) + SSQ( 1 ) = ONE + SSQ( 2 ) = MIN( M, N ) DO 290 J = 2, N - CALL DLASSQ( MIN( M, J-1 ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( MIN( M, J-1 ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 290 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 300 J = 1, N - CALL DLASSQ( MIN( M, J ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( MIN( M, J ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 300 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = MIN( M, N ) + SSQ( 1 ) = ONE + SSQ( 2 ) = MIN( M, N ) DO 310 J = 1, N - CALL DLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 310 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 320 J = 1, N - CALL DLASSQ( M-J+1, A( J, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL DLASSQ( M-J+1, A( J, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 320 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * DLANTR = VALUE diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index 91fa14ff2..d68481f7e 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -161,7 +161,6 @@ IF( C.EQ.ZERO ) THEN CS = ONE SN = ZERO - GO TO 10 * ELSE IF( B.EQ.ZERO ) THEN * @@ -174,12 +173,12 @@ A = TEMP B = -C C = ZERO - GO TO 10 +* ELSE IF( ( A-D ).EQ.ZERO .AND. SIGN( ONE, B ).NE.SIGN( ONE, C ) ) $ THEN CS = ONE SN = ZERO - GO TO 10 +* ELSE * TEMP = A - D @@ -207,6 +206,7 @@ SN = C / TAU B = B - C C = ZERO +* ELSE * * Complex eigenvalues, or real (almost) equal eigenvalues. @@ -268,8 +268,6 @@ END IF * END IF -* - 10 CONTINUE * * Store eigenvalues in (RT1R,RT1I) and (RT2R,RT2I). * diff --git a/lapack-netlib/SRC/dlaorhr_col_getrfnp.f b/lapack-netlib/SRC/dlaorhr_col_getrfnp.f new file mode 100644 index 000000000..6a7c629e8 --- /dev/null +++ b/lapack-netlib/SRC/dlaorhr_col_getrfnp.f @@ -0,0 +1,248 @@ +*> \brief \b DLAORHR_COL_GETRFNP +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAORHR_COL_GETRFNP + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAORHR_COL_GETRFNP( M, N, A, LDA, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), D( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAORHR_COL_GETRFNP computes the modified LU factorization without +*> pivoting of a real general M-by-N matrix A. The factorization has +*> the form: +*> +*> A - S = L * U, +*> +*> where: +*> S is a m-by-n diagonal sign matrix with the diagonal D, so that +*> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed +*> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing +*> i-1 steps of Gaussian elimination. 
This means that the diagonal +*> element at each step of "modified" Gaussian elimination is +*> at least one in absolute value (so that division-by-zero not +*> not possible during the division by the diagonal element); +*> +*> L is a M-by-N lower triangular matrix with unit diagonal elements +*> (lower trapezoidal if M > N); +*> +*> and U is a M-by-N upper triangular matrix +*> (upper trapezoidal if M < N). +*> +*> This routine is an auxiliary routine used in the Householder +*> reconstruction routine DORHR_COL. In DORHR_COL, this routine is +*> applied to an M-by-N matrix A with orthonormal columns, where each +*> element is bounded by one in absolute value. With the choice of +*> the matrix S above, one can show that the diagonal element at each +*> step of Gaussian elimination is the largest (in absolute value) in +*> the column on or below the diagonal, so that no pivoting is required +*> for numerical stability [1]. +*> +*> For more details on the Householder reconstruction algorithm, +*> including the modified LU factorization, see [1]. +*> +*> This is the blocked right-looking version of the algorithm, +*> calling Level 3 BLAS to update the submatrix. To factorize a block, +*> this routine calls the recursive routine DLAORHR_COL_GETRFNP2. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the M-by-N matrix to be factored. +*> On exit, the factors L and U from the factorization +*> A-S=L*U; the unit diagonal elements of L are not stored. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is DOUBLE PRECISION array, dimension min(M,N) +*> The diagonal elements of the diagonal M-by-N sign matrix S, +*> D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can +*> be only plus or minus one. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup doubleGEcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DLAORHR_COL_GETRFNP( M, N, A, LDA, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), D( * ) +* .. 
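The factorization described in this header is ordinary Gaussian elimination with the pivot shifted by plus or minus one before each step. A self-contained, unblocked C sketch of A - S = L*U (row-major, no blocking, no BLAS; purely to make the sign rule concrete, not code from this patch):

    /* Modified LU without pivoting on an n-by-n row-major matrix:
       d[i] = -sign(a(i,i)) is chosen after i elimination steps, so the
       working pivot a(i,i) - d[i] satisfies |pivot| = |a(i,i)| + 1 >= 1. */
    static void getrfnp_sketch(int n, double *a, int lda, double *d)
    {
    #define A(i, j) a[(i) * (size_t)(lda) + (j)]
        for (int i = 0; i < n; i++) {
            d[i] = (A(i, i) >= 0.0) ? -1.0 : 1.0;  /* -sign(pivot) */
            A(i, i) -= d[i];                       /* pivot of A - S */
            for (int r = i + 1; r < n; r++) {
                A(r, i) /= A(i, i);                /* column of L */
                for (int c = i + 1; c < n; c++)
                    A(r, c) -= A(r, i) * A(i, c);  /* rank-1 update */
            }
        }
    #undef A
    }

When A has orthonormal columns every entry is at most one in magnitude, which is what makes the |pivot| >= 1 guarantee strong enough to drop pivoting entirely.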
+* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLAORHR_COL_GETRFNP2, DTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLAORHR_COL_GETRFNP', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + + NB = ILAENV( 1, 'DLAORHR_COL_GETRFNP', ' ', M, N, -1, -1 ) + + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL DLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) + ELSE +* +* Use blocked code. +* + DO J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks. +* + CALL DLAORHR_COL_GETRFNP2( M-J+1, JB, A( J, J ), LDA, + $ D( J ), IINFO ) +* + IF( J+JB.LE.N ) THEN +* +* Compute block row of U. +* + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL DGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + END DO + END IF + RETURN +* +* End of DLAORHR_COL_GETRFNP +* + END diff --git a/lapack-netlib/SRC/dlaorhr_col_getrfnp2.f b/lapack-netlib/SRC/dlaorhr_col_getrfnp2.f new file mode 100644 index 000000000..f7781f2e5 --- /dev/null +++ b/lapack-netlib/SRC/dlaorhr_col_getrfnp2.f @@ -0,0 +1,305 @@ +*> \brief \b DLAORHR_COL_GETRFNP2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAORHR_GETRF2NP + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE DLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), D( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAORHR_COL_GETRFNP2 computes the modified LU factorization without +*> pivoting of a real general M-by-N matrix A. The factorization has +*> the form: +*> +*> A - S = L * U, +*> +*> where: +*> S is a m-by-n diagonal sign matrix with the diagonal D, so that +*> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed +*> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing +*> i-1 steps of Gaussian elimination. This means that the diagonal +*> element at each step of "modified" Gaussian elimination is at +*> least one in absolute value (so that division-by-zero not +*> possible during the division by the diagonal element); +*> +*> L is a M-by-N lower triangular matrix with unit diagonal elements +*> (lower trapezoidal if M > N); +*> +*> and U is a M-by-N upper triangular matrix +*> (upper trapezoidal if M < N). +*> +*> This routine is an auxiliary routine used in the Householder +*> reconstruction routine DORHR_COL. 
In DORHR_COL, this routine is +*> applied to an M-by-N matrix A with orthonormal columns, where each +*> element is bounded by one in absolute value. With the choice of +*> the matrix S above, one can show that the diagonal element at each +*> step of Gaussian elimination is the largest (in absolute value) in +*> the column on or below the diagonal, so that no pivoting is required +*> for numerical stability [1]. +*> +*> For more details on the Householder reconstruction algorithm, +*> including the modified LU factorization, see [1]. +*> +*> This is the recursive version of the LU factorization algorithm. +*> Denote A - S by B. The algorithm divides the matrix B into four +*> submatrices: +*> +*> [ B11 | B12 ] where B11 is n1 by n1, +*> B = [ -----|----- ] B21 is (m-n1) by n1, +*> [ B21 | B22 ] B12 is n1 by n2, +*> B22 is (m-n1) by n2, +*> with n1 = min(m,n)/2, n2 = n-n1. +*> +*> +*> The subroutine calls itself to factor B11, solves for B21, +*> solves for B12, updates B22, then calls itself to factor B22. +*> +*> For more details on the recursive LU algorithm, see [2]. +*> +*> DLAORHR_COL_GETRFNP2 is called to factorize a block by the blocked +*> routine DLAORHR_COL_GETRFNP, which uses blocked code calling +*. Level 3 BLAS to update the submatrix. However, DLAORHR_COL_GETRFNP2 +*> is self-sufficient and can be used without DLAORHR_COL_GETRFNP. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> +*> [2] "Recursion leads to automatic variable blocking for dense linear +*> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev., +*> vol. 41, no. 6, pp. 737-755, 1997. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the M-by-N matrix to be factored. +*> On exit, the factors L and U from the factorization +*> A-S=L*U; the unit diagonal elements of L are not stored. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is DOUBLE PRECISION array, dimension min(M,N) +*> The diagonal elements of the diagonal M-by-N sign matrix S, +*> D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can +*> be only plus or minus one. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
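The recursive variant documented above always splits at n1 = min(m,n)/2 and recurses on B11 (n1-by-n1) and, after the two triangular solves and the GEMM update, on B22 ((m-n1)-by-n2). A small runnable C program that prints the block sizes this rule generates (sizes only, not a factorization):

    #include <stdio.h>

    /* Print the recursion tree of the n1 = min(m,n)/2 split. */
    static void split(int m, int n, int depth)
    {
        if (m == 1 || n == 1) {        /* the routine's termination cases */
            printf("%*sleaf %dx%d\n", 2 * depth, "", m, n);
            return;
        }
        int n1 = (m < n ? m : n) / 2;
        int n2 = n - n1;
        printf("%*s%dx%d -> B11 %dx%d, B22 %dx%d\n",
               2 * depth, "", m, n, n1, n1, m - n1, n2);
        split(n1, n1, depth + 1);      /* factor B11 first */
        split(m - n1, n2, depth + 1);  /* then B22 after the Schur update */
    }

    int main(void) { split(7, 5, 0); return 0; }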
+* +*> \date November 2019 +* +*> \ingroup doubleGEcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + RECURSIVE SUBROUTINE DLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), D( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION SFMIN + INTEGER I, IINFO, N1, N2 +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DSCAL, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DSIGN, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLAORHR_COL_GETRFNP2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) + $ RETURN + + IF ( M.EQ.1 ) THEN +* +* One row case, (also recursion termination case), +* use unblocked code +* +* Transfer the sign +* + D( 1 ) = -DSIGN( ONE, A( 1, 1 ) ) +* +* Construct the row of U +* + A( 1, 1 ) = A( 1, 1 ) - D( 1 ) +* + ELSE IF( N.EQ.1 ) THEN +* +* One column case, (also recursion termination case), +* use unblocked code +* +* Transfer the sign +* + D( 1 ) = -DSIGN( ONE, A( 1, 1 ) ) +* +* Construct the row of U +* + A( 1, 1 ) = A( 1, 1 ) - D( 1 ) +* +* Scale the elements 2:M of the column +* +* Determine machine safe minimum +* + SFMIN = DLAMCH('S') +* +* Construct the subdiagonal elements of L +* + IF( ABS( A( 1, 1 ) ) .GE. SFMIN ) THEN + CALL DSCAL( M-1, ONE / A( 1, 1 ), A( 2, 1 ), 1 ) + ELSE + DO I = 2, M + A( I, 1 ) = A( I, 1 ) / A( 1, 1 ) + END DO + END IF +* + ELSE +* +* Divide the matrix B into four submatrices +* + N1 = MIN( M, N ) / 2 + N2 = N-N1 + +* +* Factor B11, recursive call +* + CALL DLAORHR_COL_GETRFNP2( N1, N1, A, LDA, D, IINFO ) +* +* Solve for B21 +* + CALL DTRSM( 'R', 'U', 'N', 'N', M-N1, N1, ONE, A, LDA, + $ A( N1+1, 1 ), LDA ) +* +* Solve for B12 +* + CALL DTRSM( 'L', 'L', 'N', 'U', N1, N2, ONE, A, LDA, + $ A( 1, N1+1 ), LDA ) +* +* Update B22, i.e. compute the Schur complement +* B22 := B22 - B21*B12 +* + CALL DGEMM( 'N', 'N', M-N1, N2, N1, -ONE, A( N1+1, 1 ), LDA, + $ A( 1, N1+1 ), LDA, ONE, A( N1+1, N1+1 ), LDA ) +* +* Factor B22, recursive call +* + CALL DLAORHR_COL_GETRFNP2( M-N1, N2, A( N1+1, N1+1 ), LDA, + $ D( N1+1 ), IINFO ) +* + END IF + RETURN +* +* End of DLAORHR_COL_GETRFNP2 +* + END diff --git a/lapack-netlib/SRC/dlaqps.f b/lapack-netlib/SRC/dlaqps.f index 395d8e0b1..0009de951 100644 --- a/lapack-netlib/SRC/dlaqps.f +++ b/lapack-netlib/SRC/dlaqps.f @@ -127,7 +127,7 @@ *> \param[in,out] AUXV *> \verbatim *> AUXV is DOUBLE PRECISION array, dimension (NB) -*> Auxiliar vector. +*> Auxiliary vector. 
*> \endverbatim *> *> \param[in,out] F diff --git a/lapack-netlib/SRC/dlaqr0.f b/lapack-netlib/SRC/dlaqr0.f index 247d4ef30..f362c096c 100644 --- a/lapack-netlib/SRC/dlaqr0.f +++ b/lapack-netlib/SRC/dlaqr0.f @@ -67,7 +67,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -79,12 +79,12 @@ *> \verbatim *> IHI is INTEGER *> It is assumed that H is already upper triangular in rows -*> and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1, +*> and columns 1:ILO-1 and IHI+1:N and, if ILO > 1, *> H(ILO,ILO-1) is zero. ILO and IHI are normally set by a *> previous call to DGEBAL, and then passed to DGEHRD when the *> matrix output by DGEBAL is reduced to Hessenberg form. *> Otherwise, ILO and IHI should be set to 1 and N, -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -97,19 +97,19 @@ *> decomposition (the Schur form); 2-by-2 diagonal blocks *> (corresponding to complex conjugate pairs of eigenvalues) *> are returned in standard form, with H(i,i) = H(i+1,i+1) -*> and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is +*> and H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and WANTT is *> .FALSE., then the contents of H are unspecified on exit. -*> (The output value of H when INFO.GT.0 is given under the +*> (The output value of H when INFO > 0 is given under the *> description of INFO below.) *> -*> This subroutine may explicitly set H(i,j) = 0 for i.GT.j and +*> This subroutine may explicitly set H(i,j) = 0 for i > j and *> j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] WR @@ -125,7 +125,7 @@ *> and WI(ILO:IHI). If two eigenvalues are computed as a *> complex conjugate pair, they are stored in consecutive *> elements of WR and WI, say the i-th and (i+1)th, with -*> WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then +*> WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., then *> the eigenvalues are stored in the same order as on the *> diagonal of the Schur form returned in H, with *> WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal @@ -143,7 +143,7 @@ *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. -*> 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N. +*> 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -153,7 +153,7 @@ *> If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is *> replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the *> orthogonal Schur factor of H(ILO:IHI,ILO:IHI). -*> (The output value of Z when INFO.GT.0 is given under +*> (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -161,7 +161,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if WANTZ is .TRUE. -*> then LDZ.GE.MAX(1,IHIZ). Otherwize, LDZ.GE.1. +*> then LDZ >= MAX(1,IHIZ). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -174,7 +174,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient, but LWORK typically as large as 6*N may *> be required for optimal performance. 
A workspace query *> to determine the optimal workspace size is recommended. @@ -190,19 +190,19 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, DLAQR0 failed to compute all of +*> = 0: successful exit +*> > 0: if INFO = i, DLAQR0 failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and WANT is .FALSE., then on exit, +*> If INFO > 0 and WANT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -210,7 +210,7 @@ *> value of H is upper Hessenberg and quasi-triangular *> in rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> *> (final value of Z(ILO:IHI,ILOZ:IHIZ) *> = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U @@ -218,7 +218,7 @@ *> where U is the orthogonal matrix in (*) (regard- *> less of the value of WANTT.) *> -*> If INFO .GT. 0 and WANTZ is .FALSE., then Z is not +*> If INFO > 0 and WANTZ is .FALSE., then Z is not *> accessed. *> \endverbatim * @@ -678,7 +678,7 @@ END IF END IF * -* ==== Use up to NS of the the smallest magnatiude +* ==== Use up to NS of the the smallest magnitude * . shifts. If there aren't NS shifts available, * . then use them all, possibly dropping one to * . make the number of shifts even. ==== diff --git a/lapack-netlib/SRC/dlaqr1.f b/lapack-netlib/SRC/dlaqr1.f index 795b072ab..4ccf997e7 100644 --- a/lapack-netlib/SRC/dlaqr1.f +++ b/lapack-netlib/SRC/dlaqr1.f @@ -69,7 +69,7 @@ *> \verbatim *> LDH is INTEGER *> The leading dimension of H as declared in -*> the calling procedure. LDH.GE.N +*> the calling procedure. LDH >= N *> \endverbatim *> *> \param[in] SR1 diff --git a/lapack-netlib/SRC/dlaqr2.f b/lapack-netlib/SRC/dlaqr2.f index 431b3f123..01fdf3046 100644 --- a/lapack-netlib/SRC/dlaqr2.f +++ b/lapack-netlib/SRC/dlaqr2.f @@ -103,7 +103,7 @@ *> \param[in] NW *> \verbatim *> NW is INTEGER -*> Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). +*> Deflation window size. 1 <= NW <= (KBOT-KTOP+1). *> \endverbatim *> *> \param[in,out] H @@ -121,7 +121,7 @@ *> \verbatim *> LDH is INTEGER *> Leading dimension of H just as declared in the calling -*> subroutine. N .LE. LDH +*> subroutine. N <= LDH *> \endverbatim *> *> \param[in] ILOZ @@ -133,7 +133,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -149,7 +149,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of Z just as declared in the -*> calling subroutine. 1 .LE. LDZ. +*> calling subroutine. 1 <= LDZ. *> \endverbatim *> *> \param[out] NS @@ -194,13 +194,13 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of V just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is INTEGER -*> The number of columns of T. NH.GE.NW. +*> The number of columns of T. NH >= NW. 
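The INFO contract spelled out for DLAQR0 above (zero on success, -i for a bad argument, i > 0 for partial convergence with eigenvalues i+1:N still valid) is worth decoding explicitly in callers. A hedged sketch via LAPACKE's DHSEQR driver, assuming a LAPACKE build is available; the call and messages are illustrative and not part of this patch:

    #include <stdio.h>
    #include <lapacke.h>

    /* Eigenvalues of an upper Hessenberg matrix, with INFO decoding. */
    int hess_eigs(lapack_int n, double *h, double *wr, double *wi)
    {
        lapack_int info = LAPACKE_dhseqr(LAPACK_COL_MAJOR, 'E', 'N',
                                         n, 1, n, h, n, wr, wi, NULL, 1);
        if (info < 0)
            fprintf(stderr, "argument %d had an illegal value\n", (int)-info);
        else if (info > 0)
            fprintf(stderr, "eigenvalues %d:%d converged, 1:%d did not\n",
                    (int)info + 1, (int)n, (int)info);
        return (int)info;
    }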
*> \endverbatim *> *> \param[out] T @@ -212,14 +212,14 @@ *> \verbatim *> LDT is INTEGER *> The leading dimension of T just as declared in the -*> calling subroutine. NW .LE. LDT +*> calling subroutine. NW <= LDT *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> The number of rows of work array WV available for -*> workspace. NV.GE.NW. +*> workspace. NV >= NW. *> \endverbatim *> *> \param[out] WV @@ -231,7 +231,7 @@ *> \verbatim *> LDWV is INTEGER *> The leading dimension of W just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/dlaqr3.f b/lapack-netlib/SRC/dlaqr3.f index aa23617c3..1dbf55c9e 100644 --- a/lapack-netlib/SRC/dlaqr3.f +++ b/lapack-netlib/SRC/dlaqr3.f @@ -100,7 +100,7 @@ *> \param[in] NW *> \verbatim *> NW is INTEGER -*> Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). +*> Deflation window size. 1 <= NW <= (KBOT-KTOP+1). *> \endverbatim *> *> \param[in,out] H @@ -118,7 +118,7 @@ *> \verbatim *> LDH is INTEGER *> Leading dimension of H just as declared in the calling -*> subroutine. N .LE. LDH +*> subroutine. N <= LDH *> \endverbatim *> *> \param[in] ILOZ @@ -130,7 +130,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -146,7 +146,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of Z just as declared in the -*> calling subroutine. 1 .LE. LDZ. +*> calling subroutine. 1 <= LDZ. *> \endverbatim *> *> \param[out] NS @@ -191,13 +191,13 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of V just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is INTEGER -*> The number of columns of T. NH.GE.NW. +*> The number of columns of T. NH >= NW. *> \endverbatim *> *> \param[out] T @@ -209,14 +209,14 @@ *> \verbatim *> LDT is INTEGER *> The leading dimension of T just as declared in the -*> calling subroutine. NW .LE. LDT +*> calling subroutine. NW <= LDT *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> The number of rows of work array WV available for -*> workspace. NV.GE.NW. +*> workspace. NV >= NW. *> \endverbatim *> *> \param[out] WV @@ -228,7 +228,7 @@ *> \verbatim *> LDWV is INTEGER *> The leading dimension of W just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/dlaqr4.f b/lapack-netlib/SRC/dlaqr4.f index 89b9b7f20..454bf9608 100644 --- a/lapack-netlib/SRC/dlaqr4.f +++ b/lapack-netlib/SRC/dlaqr4.f @@ -74,7 +74,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -86,12 +86,12 @@ *> \verbatim *> IHI is INTEGER *> It is assumed that H is already upper triangular in rows -*> and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1, +*> and columns 1:ILO-1 and IHI+1:N and, if ILO > 1, *> H(ILO,ILO-1) is zero. ILO and IHI are normally set by a *> previous call to DGEBAL, and then passed to DGEHRD when the *> matrix output by DGEBAL is reduced to Hessenberg form. *> Otherwise, ILO and IHI should be set to 1 and N, -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. 
If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -104,19 +104,19 @@ *> decomposition (the Schur form); 2-by-2 diagonal blocks *> (corresponding to complex conjugate pairs of eigenvalues) *> are returned in standard form, with H(i,i) = H(i+1,i+1) -*> and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is +*> and H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and WANTT is *> .FALSE., then the contents of H are unspecified on exit. -*> (The output value of H when INFO.GT.0 is given under the +*> (The output value of H when INFO > 0 is given under the *> description of INFO below.) *> -*> This subroutine may explicitly set H(i,j) = 0 for i.GT.j and +*> This subroutine may explicitly set H(i,j) = 0 for i > j and *> j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] WR @@ -132,7 +132,7 @@ *> and WI(ILO:IHI). If two eigenvalues are computed as a *> complex conjugate pair, they are stored in consecutive *> elements of WR and WI, say the i-th and (i+1)th, with -*> WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then +*> WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., then *> the eigenvalues are stored in the same order as on the *> diagonal of the Schur form returned in H, with *> WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal @@ -150,7 +150,7 @@ *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. -*> 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N. +*> 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -160,7 +160,7 @@ *> If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is *> replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the *> orthogonal Schur factor of H(ILO:IHI,ILO:IHI). -*> (The output value of Z when INFO.GT.0 is given under +*> (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -168,7 +168,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if WANTZ is .TRUE. -*> then LDZ.GE.MAX(1,IHIZ). Otherwize, LDZ.GE.1. +*> then LDZ >= MAX(1,IHIZ). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -181,7 +181,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient, but LWORK typically as large as 6*N may *> be required for optimal performance. A workspace query *> to determine the optimal workspace size is recommended. @@ -197,19 +197,19 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, DLAQR4 failed to compute all of +*> = 0: successful exit +*> > 0: if INFO = i, DLAQR4 failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and WANT is .FALSE., then on exit, +*> If INFO > 0 and WANT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 
0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -217,7 +217,7 @@ *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> *> (final value of Z(ILO:IHI,ILOZ:IHIZ) *> = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U @@ -225,7 +225,7 @@ *> where U is the orthogonal matrix in (*) (regard- *> less of the value of WANTT.) *> -*> If INFO .GT. 0 and WANTZ is .FALSE., then Z is not +*> If INFO > 0 and WANTZ is .FALSE., then Z is not *> accessed. *> \endverbatim * @@ -677,7 +677,7 @@ END IF END IF * -* ==== Use up to NS of the the smallest magnatiude +* ==== Use up to NS of the the smallest magnitude * . shifts. If there aren't NS shifts available, * . then use them all, possibly dropping one to * . make the number of shifts even. ==== diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index 5cc4eda1a..f58db9c89 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -133,7 +133,7 @@ *> \verbatim *> LDH is INTEGER *> LDH is the leading dimension of H just as declared in the -*> calling procedure. LDH.GE.MAX(1,N). +*> calling procedure. LDH >= MAX(1,N). *> \endverbatim *> *> \param[in] ILOZ @@ -145,7 +145,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N *> \endverbatim *> *> \param[in,out] Z @@ -161,7 +161,7 @@ *> \verbatim *> LDZ is INTEGER *> LDA is the leading dimension of Z just as declared in -*> the calling procedure. LDZ.GE.N. +*> the calling procedure. LDZ >= N. *> \endverbatim *> *> \param[out] V @@ -173,7 +173,7 @@ *> \verbatim *> LDV is INTEGER *> LDV is the leading dimension of V as declared in the -*> calling procedure. LDV.GE.3. +*> calling procedure. LDV >= 3. *> \endverbatim *> *> \param[out] U @@ -185,33 +185,14 @@ *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU.GE.3*NSHFTS-3. -*> \endverbatim -*> -*> \param[in] NH -*> \verbatim -*> NH is INTEGER -*> NH is the number of columns in array WH available for -*> workspace. NH.GE.1. -*> \endverbatim -*> -*> \param[out] WH -*> \verbatim -*> WH is DOUBLE PRECISION array, dimension (LDWH,NH) -*> \endverbatim -*> -*> \param[in] LDWH -*> \verbatim -*> LDWH is INTEGER -*> Leading dimension of WH just as declared in the -*> calling procedure. LDWH.GE.3*NSHFTS-3. +*> in the calling subroutine. LDU >= 3*NSHFTS-3. *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> NV is the number of rows in WV agailable for workspace. -*> NV.GE.1. +*> NV >= 1. *> \endverbatim *> *> \param[out] WV @@ -223,9 +204,28 @@ *> \verbatim *> LDWV is INTEGER *> LDWV is the leading dimension of WV as declared in the -*> in the calling subroutine. LDWV.GE.NV. +*> in the calling subroutine. LDWV >= NV. *> \endverbatim * +*> \param[in] NH +*> \verbatim +*> NH is INTEGER +*> NH is the number of columns in array WH available for +*> workspace. NH >= 1. +*> \endverbatim +*> +*> \param[out] WH +*> \verbatim +*> WH is DOUBLE PRECISION array, dimension (LDWH,NH) +*> \endverbatim +*> +*> \param[in] LDWH +*> \verbatim +*> LDWH is INTEGER +*> Leading dimension of WH just as declared in the +*> calling procedure. LDWH >= 3*NSHFTS-3. 
+*> \endverbatim +*> * Authors: * ======== * diff --git a/lapack-netlib/SRC/dlarfb.f b/lapack-netlib/SRC/dlarfb.f index 5b2cc2ba8..e63641213 100644 --- a/lapack-netlib/SRC/dlarfb.f +++ b/lapack-netlib/SRC/dlarfb.f @@ -92,6 +92,8 @@ *> K is INTEGER *> The order of the matrix T (= the number of elementary *> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. *> \endverbatim *> *> \param[in] V diff --git a/lapack-netlib/SRC/dlarfx.f b/lapack-netlib/SRC/dlarfx.f index 260d367d4..a9e4496f9 100644 --- a/lapack-netlib/SRC/dlarfx.f +++ b/lapack-netlib/SRC/dlarfx.f @@ -94,7 +94,7 @@ *> \param[in] LDC *> \verbatim *> LDC is INTEGER -*> The leading dimension of the array C. LDA >= (1,M). +*> The leading dimension of the array C. LDC >= (1,M). *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/dlarfy.f b/lapack-netlib/SRC/dlarfy.f index a0b0ebb31..3000b38bc 100644 --- a/lapack-netlib/SRC/dlarfy.f +++ b/lapack-netlib/SRC/dlarfy.f @@ -103,7 +103,7 @@ * *> \date December 2016 * -*> \ingroup double_eig +*> \ingroup doubleOTHERauxiliary * * ===================================================================== SUBROUTINE DLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) diff --git a/lapack-netlib/SRC/dlarrb.f b/lapack-netlib/SRC/dlarrb.f index 2b6389e25..ddf3888b9 100644 --- a/lapack-netlib/SRC/dlarrb.f +++ b/lapack-netlib/SRC/dlarrb.f @@ -91,7 +91,7 @@ *> RTOL2 is DOUBLE PRECISION *> Tolerance for the convergence of the bisection intervals. *> An interval [LEFT,RIGHT] has converged if -*> RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) +*> RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) *> where GAP is the (estimated) distance to the nearest *> eigenvalue. *> \endverbatim @@ -117,7 +117,7 @@ *> WGAP is DOUBLE PRECISION array, dimension (N-1) *> On input, the (estimated) gaps between consecutive *> eigenvalues of L D L^T, i.e., WGAP(I-OFFSET) is the gap between -*> eigenvalues I and I+1. Note that if IFIRST.EQ.ILAST +*> eigenvalues I and I+1. Note that if IFIRST = ILAST *> then WGAP(IFIRST-OFFSET) must be set to ZERO. *> On output, these gaps are refined. *> \endverbatim diff --git a/lapack-netlib/SRC/dlarre.f b/lapack-netlib/SRC/dlarre.f index 0613efbc3..ce55442e2 100644 --- a/lapack-netlib/SRC/dlarre.f +++ b/lapack-netlib/SRC/dlarre.f @@ -150,7 +150,7 @@ *> RTOL2 is DOUBLE PRECISION *> Parameters for bisection. *> An interval [LEFT,RIGHT] has converged if -*> RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) +*> RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) *> \endverbatim *> *> \param[in] SPLTOL diff --git a/lapack-netlib/SRC/dlarrj.f b/lapack-netlib/SRC/dlarrj.f index 097ba9f77..a4bfb210c 100644 --- a/lapack-netlib/SRC/dlarrj.f +++ b/lapack-netlib/SRC/dlarrj.f @@ -85,7 +85,7 @@ *> RTOL is DOUBLE PRECISION *> Tolerance for the convergence of the bisection intervals. *> An interval [LEFT,RIGHT] has converged if -*> RIGHT-LEFT.LT.RTOL*MAX(|LEFT|,|RIGHT|). +*> RIGHT-LEFT < RTOL*MAX(|LEFT|,|RIGHT|). *> \endverbatim *> *> \param[in] OFFSET diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f index cace17c0e..4a59a2bbf 100644 --- a/lapack-netlib/SRC/dlarrv.f +++ b/lapack-netlib/SRC/dlarrv.f @@ -149,7 +149,7 @@ *> RTOL2 is DOUBLE PRECISION *> Parameters for bisection. 
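The convergence test rewritten in dlarrb.f above (and again in the dlarre.f and dlarrv.f hunks nearby) reads naturally as a predicate; in C, with the same names:

    #include <math.h>

    /* A bisection interval [left, right] has converged when its width is
       below max(rtol1*gap, rtol2*max(|left|, |right|)). */
    static int interval_converged(double left, double right, double gap,
                                  double rtol1, double rtol2)
    {
        double width = right - left;
        return width < fmax(rtol1 * gap,
                            rtol2 * fmax(fabs(left), fabs(right)));
    }

Here gap is the estimated distance to the nearest other eigenvalue, so the first term accepts an interval once it is small relative to that separation, even if it is still wide relative to its own endpoints.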
*> An interval [LEFT,RIGHT] has converged if -*> RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) +*> RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) *> \endverbatim *> *> \param[in,out] W diff --git a/lapack-netlib/SRC/dlasd7.f b/lapack-netlib/SRC/dlasd7.f index e0ddedeb5..66f665cf8 100644 --- a/lapack-netlib/SRC/dlasd7.f +++ b/lapack-netlib/SRC/dlasd7.f @@ -400,7 +400,7 @@ VL( I ) = VLW( IDXI ) 50 CONTINUE * -* Calculate the allowable deflation tolerence +* Calculate the allowable deflation tolerance * EPS = DLAMCH( 'Epsilon' ) TOL = MAX( ABS( ALPHA ), ABS( BETA ) ) diff --git a/lapack-netlib/SRC/dlasr.f b/lapack-netlib/SRC/dlasr.f index 6059c6293..f707970e4 100644 --- a/lapack-netlib/SRC/dlasr.f +++ b/lapack-netlib/SRC/dlasr.f @@ -175,7 +175,7 @@ *> \verbatim *> A is DOUBLE PRECISION array, dimension (LDA,N) *> The M-by-N matrix A. On exit, A is overwritten by P*A if -*> SIDE = 'R' or by A*P**T if SIDE = 'L'. +*> SIDE = 'L' or by A*P**T if SIDE = 'R'. *> \endverbatim *> *> \param[in] LDA diff --git a/lapack-netlib/SRC/dlassq.f b/lapack-netlib/SRC/dlassq.f index 885395e3c..5922360f9 100644 --- a/lapack-netlib/SRC/dlassq.f +++ b/lapack-netlib/SRC/dlassq.f @@ -60,7 +60,7 @@ *> *> \param[in] X *> \verbatim -*> X is DOUBLE PRECISION array, dimension (N) +*> X is DOUBLE PRECISION array, dimension (1+(N-1)*INCX) *> The vector for which a scaled sum of squares is computed. *> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. *> \endverbatim diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index 6e4ca20fd..619a1f1a2 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -1,3 +1,4 @@ +*> \brief \b DLASWLQ * * Definition: * =========== @@ -18,9 +19,20 @@ *> *> \verbatim *> -*> DLASWLQ computes a blocked Short-Wide LQ factorization of a -*> M-by-N matrix A, where N >= M: -*> A = L * Q +*> DLASWLQ computes a blocked Tall-Skinny LQ factorization of +*> a real M-by-N matrix A for M <= N: +*> +*> A = ( L 0 ) * Q, +*> +*> where: +*> +*> Q is a n-by-N orthogonal matrix, stored on exit in an implicit +*> form in the elements above the digonal of the array A and in +*> the elemenst of the array T; +*> L is an lower-triangular M-by-M matrix stored on exit in +*> the elements on and below the diagonal of the array A. +*> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored. +*> *> \endverbatim * * Arguments: @@ -150,7 +162,7 @@ SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, $ INFO) * -* -- LAPACK computational routine (version 3.7.1) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. 
-- * June 2017 diff --git a/lapack-netlib/SRC/dlasyf_aa.f b/lapack-netlib/SRC/dlasyf_aa.f index 6b75e46e0..793537e04 100644 --- a/lapack-netlib/SRC/dlasyf_aa.f +++ b/lapack-netlib/SRC/dlasyf_aa.f @@ -284,8 +284,9 @@ * * Swap A(I1, I2+1:M) with A(I2, I2+1:M) * - CALL DSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, - $ A( J1+I2-1, I2+1 ), LDA ) + IF( I2.LT.M ) + $ CALL DSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, + $ A( J1+I2-1, I2+1 ), LDA ) * * Swap A(I1, I1) with A(I2,I2) * @@ -325,13 +326,15 @@ * Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1) * - IF( A( K, J+1 ).NE.ZERO ) THEN - ALPHA = ONE / A( K, J+1 ) - CALL DCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) - CALL DSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) - ELSE - CALL DLASET( 'Full', 1, M-J-1, ZERO, ZERO, - $ A( K, J+2 ), LDA) + IF( J.LT.(M-1) ) THEN + IF( A( K, J+1 ).NE.ZERO ) THEN + ALPHA = ONE / A( K, J+1 ) + CALL DCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) + CALL DSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) + ELSE + CALL DLASET( 'Full', 1, M-J-1, ZERO, ZERO, + $ A( K, J+2 ), LDA) + END IF END IF END IF J = J + 1 @@ -432,8 +435,9 @@ * * Swap A(I2+1:M, I1) with A(I2+1:M, I2) * - CALL DSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, - $ A( I2+1, J1+I2-1 ), 1 ) + IF( I2.LT.M ) + $ CALL DSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, + $ A( I2+1, J1+I2-1 ), 1 ) * * Swap A(I1, I1) with A(I2, I2) * @@ -473,13 +477,15 @@ * Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1) * - IF( A( J+1, K ).NE.ZERO ) THEN - ALPHA = ONE / A( J+1, K ) - CALL DCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) - CALL DSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) - ELSE - CALL DLASET( 'Full', M-J-1, 1, ZERO, ZERO, - $ A( J+2, K ), LDA ) + IF( J.LT.(M-1) ) THEN + IF( A( J+1, K ).NE.ZERO ) THEN + ALPHA = ONE / A( J+1, K ) + CALL DCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) + CALL DSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) + ELSE + CALL DLASET( 'Full', M-J-1, 1, ZERO, ZERO, + $ A( J+2, K ), LDA ) + END IF END IF END IF J = J + 1 diff --git a/lapack-netlib/SRC/dlasyf_rk.f b/lapack-netlib/SRC/dlasyf_rk.f index 209b4c89d..d581eeedc 100644 --- a/lapack-netlib/SRC/dlasyf_rk.f +++ b/lapack-netlib/SRC/dlasyf_rk.f @@ -321,7 +321,7 @@ * of A and working backwards, and compute the matrix W = U12*D * for use in updating A11 * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = ZERO @@ -649,7 +649,7 @@ * of A and working forwards, and compute the matrix W = L21*D * for use in updating A22 * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = ZERO * diff --git a/lapack-netlib/SRC/dlasyf_rook.f b/lapack-netlib/SRC/dlasyf_rook.f index 49ee7a6c9..557032104 100644 --- a/lapack-netlib/SRC/dlasyf_rook.f +++ b/lapack-netlib/SRC/dlasyf_rook.f @@ -21,7 +21,7 @@ * SUBROUTINE DLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, INFO ) * * .. Scalar Arguments .. -* CHARADLATER UPLO +* CHARACTER UPLO * INTEGER INFO, KB, LDA, LDW, N, NB * .. * .. Array Arguments .. diff --git a/lapack-netlib/SRC/dlatdf.f b/lapack-netlib/SRC/dlatdf.f index fd05059b3..8001e0830 100644 --- a/lapack-netlib/SRC/dlatdf.f +++ b/lapack-netlib/SRC/dlatdf.f @@ -85,7 +85,7 @@ *> RHS is DOUBLE PRECISION array, dimension (N) *> On entry, RHS contains contributions from other subsystems. 
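The dlasyf_aa.f changes above wrap calls like DSWAP( M-I2, A( J1+I1-1, I2+1 ), ... ) in IF( I2.LT.M ): a zero-length BLAS call would be a no-op, but forming the argument A( ..., I2+1 ) with I2 = M already builds a reference one column past the array. The same defensive pattern in C, as a hypothetical helper with 0-based indices:

    /* Swap the tails of rows r1 and r2, columns c0..ncols-1, of a
       row-major matrix.  The early return both skips the empty case and
       avoids computing &a[r1*lda + c0] when c0 is out of range. */
    static void swap_row_tails(double *a, int lda, int r1, int r2,
                               int c0, int ncols)
    {
        if (c0 >= ncols)
            return;
        for (int c = c0; c < ncols; c++) {
            double t = a[r1 * (size_t)lda + c];
            a[r1 * (size_t)lda + c] = a[r2 * (size_t)lda + c];
            a[r2 * (size_t)lda + c] = t;
        }
    }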
*> On exit, RHS contains the solution of the subsystem with -*> entries acoording to the value of IJOB (see above). +*> entries according to the value of IJOB (see above). *> \endverbatim *> *> \param[in,out] RDSUM @@ -260,7 +260,7 @@ * * Solve for U-part, look-ahead for RHS(N) = +-1. This is not done * in BSOLVE and will hopefully give us a better estimate because -* any ill-conditioning of the original matrix is transfered to U +* any ill-conditioning of the original matrix is transferred to U * and not to L. U(N, N) is an approximation to sigma_min(LU). * CALL DCOPY( N-1, RHS, 1, XP, 1 ) diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index 1ce7c4de0..598d2938e 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -1,3 +1,4 @@ +*> \brief \b DLATSQR * * Definition: * =========== @@ -19,8 +20,22 @@ *> \verbatim *> *> DLATSQR computes a blocked Tall-Skinny QR factorization of -*> an M-by-N matrix A, where M >= N: -*> A = Q * R . +*> a real M-by-N matrix A for M >= N: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is an M-by-M orthogonal matrix, stored on exit in an implicit +*> form in the elements below the diagonal of the array A and in +*> the elements of the array T; +*> +*> R is an upper-triangular N-by-N matrix, stored on exit in +*> the elements on and above the diagonal of the array A. +*> +*> 0 is an (M-N)-by-N zero matrix, and is not stored. +*> *> \endverbatim * * Arguments: @@ -149,10 +164,10 @@ SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, $ LWORK, INFO) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK diff --git a/lapack-netlib/SRC/dorgtsqr.f b/lapack-netlib/SRC/dorgtsqr.f new file mode 100644 index 000000000..85b05b6b5 --- /dev/null +++ b/lapack-netlib/SRC/dorgtsqr.f @@ -0,0 +1,306 @@ +*> \brief \b DORGTSQR +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DORGTSQR + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> +* Definition: +* =========== +* +* SUBROUTINE DORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, +* $ INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORGTSQR generates an M-by-N real matrix Q_out with orthonormal columns, +*> which are the first N columns of a product of real orthogonal +*> matrices of order M which are returned by DLATSQR +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> See the documentation for DLATSQR. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by DLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size).
+*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by DLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not accessed. +*> The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by DLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored) (same format as the output A +*> below the diagonal in DLATSQR). +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e. the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of NIRB block reflector sequences +*> is stored in a larger NB-by-N column block of T and consists +*> of NICB smaller NB-by-NB upper-triangular column blocks. +*> (same format as the output T in DLATSQR). +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(2,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. LWORK >= (M+NB)*N. +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* ..
+* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DLAMTSQR, DLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + LQUERY = LWORK.EQ.-1 + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array C(LDC, N) and WORK(LWORK) +* in the call to DLAMTSQR. See the documentation for DLAMTSQR. +* + IF( LWORK.LT.2 .AND. (.NOT.LQUERY) ) THEN + INFO = -10 + ELSE +* +* Set block size for column blocks +* + NBLOCAL = MIN( NB, N ) +* +* LWORK = -1, then set the size for the array C(LDC,N) +* in DLAMTSQR call and set the optimal size of the work array +* WORK(LWORK) in DLAMTSQR call. +* + LDC = M + LC = LDC*N + LW = N * NBLOCAL +* + LWORKOPT = LC+LW +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -10 + END IF + END IF +* + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DORGTSQR', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in +* of M-by-M orthogonal matrix Q_in, which is implicitly stored in +* the subdiagonal part of input array A and in the input array T. +* Perform by the following operation using the routine DLAMTSQR. +* +* Q1_in = Q_in * ( I ), where I is a N-by-N identity matrix, +* ( 0 ) 0 is a (M-N)-by-N zero matrix. +* +* (1a) Form M-by-N matrix in the array WORK(1:LDC*N) with ones +* on the diagonal and zeros elsewhere. +* + CALL DLASET( 'F', M, N, ZERO, ONE, WORK, LDC ) +* +* (1b) On input, WORK(1:LDC*N) stores ( I ); +* ( 0 ) +* +* On output, WORK(1:LDC*N) stores Q1_in. +* + CALL DLAMTSQR( 'L', 'N', M, N, N, MB, NBLOCAL, A, LDA, T, LDT, + $ WORK, LDC, WORK( LC+1 ), LW, IINFO ) +* +* (2) Copy the result from the part of the work array (1:M,1:N) +* with the leading dimension LDC that starts at WORK(1) into +* the output array A(1:M,1:N) column-by-column. +* + DO J = 1, N + CALL DCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 ) + END DO +* + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN +* +* End of DORGTSQR +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/dorhr_col.f b/lapack-netlib/SRC/dorhr_col.f new file mode 100644 index 000000000..b5a65973d --- /dev/null +++ b/lapack-netlib/SRC/dorhr_col.f @@ -0,0 +1,440 @@ +*> \brief \b DORHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DORHR_COL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> +* Definition: +* =========== +* +* SUBROUTINE DORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, M, N, NB +* .. +* .. Array Arguments .. 
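DORGTSQR above is a new routine, so a minimal calling sequence may help readers. The following sketch is hypothetical and not part of the patch: the argument lists come from the SUBROUTINE statements quoted in these hunks, MB > N and NB >= 1 match the stated requirements, T gets N*NIRB = 4 columns per the NIRB formula in the T description, and LWORK = (M+NB)*N satisfies the stated DORGTSQR bound (the assumption here is that it also covers DLATSQR's smaller workspace need, which these hunks do not restate).

      PROGRAM TSQRD
      IMPLICIT NONE
      INTEGER M, N, MB, NB, LDA, LDT, LWORK
      PARAMETER ( M = 6, N = 2, MB = 4, NB = 2 )
      PARAMETER ( LDA = M, LDT = NB, LWORK = ( M+NB )*N )
      INTEGER INFO, I, J
      DOUBLE PRECISION A( LDA, N ), T( LDT, 2*N ), WORK( LWORK )
      DOUBLE PRECISION R( N, N )
      EXTERNAL DLATSQR, DORGTSQR
*     Fill the tall-skinny matrix A with full-rank test data.
      DO J = 1, N
         DO I = 1, M
            A( I, J ) = DBLE( I )**( J-1 )
         END DO
      END DO
*     Blocked TSQR: on exit R sits in the upper triangle of A, and
*     Q is held implicitly below the diagonal of A and in T.
      CALL DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
     $              INFO )
*     Save R, since forming the explicit Q overwrites all of A.
      DO J = 1, N
         DO I = 1, N
            R( I, J ) = 0.0D+0
            IF( I.LE.J ) R( I, J ) = A( I, J )
         END DO
      END DO
*     Form Q_out = the first N columns of Q explicitly (in A).
      CALL DORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
     $               INFO )
      WRITE( *, * ) 'DORGTSQR INFO =', INFO
      END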
+* DOUBLE PRECISION A( LDA, * ), D( * ), T( LDT, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORHR_COL takes an M-by-N real matrix Q_in with orthonormal columns +*> as input, stored in A, and performs Householder Reconstruction (HR), +*> i.e. reconstructs Householder vectors V(i) implicitly representing +*> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S, +*> where S is an N-by-N diagonal matrix with diagonal entries +*> equal to +1 or -1. The Householder vectors (columns V(i) of V) are +*> stored in A on output, and the diagonal entries of S are stored in D. +*> Block reflectors are also returned in T +*> (same output format as DGEQRT). +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size to be used in the reconstruction +*> of Householder column vector blocks in the array A and +*> corresponding block reflectors in the array T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size.) +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> +*> The array A contains an M-by-N orthonormal matrix Q_in, +*> i.e. the columns of A are orthogonal unit vectors. +*> +*> On exit: +*> +*> The elements below the diagonal of A represent the unit +*> lower-trapezoidal matrix V of Householder column vectors +*> V(i). The unit diagonal entries of V are not stored +*> (same format as the output below the diagonal in A from +*> DGEQRT). The matrix T and the matrix V stored on output +*> in A implicitly define Q_out. +*> +*> The elements above the diagonal contain the factor U +*> of the "modified" LU-decomposition: +*> Q_in - ( S ) = V * U +*> ( 0 ) +*> where 0 is an (M-N)-by-N zero matrix. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is DOUBLE PRECISION array, +*> dimension (LDT, N) +*> +*> Let NOCB = Number_of_output_col_blocks +*> = CEIL(N/NB) +*> +*> On exit, T(1:NB, 1:N) contains NOCB upper-triangular +*> block reflectors used to define Q_out stored in compact +*> form as a sequence of upper-triangular NB-by-NB column +*> blocks (same format as the output T in DGEQRT). +*> The matrix T and the matrix V stored on output in A +*> implicitly define Q_out. NOTE: The lower triangles +*> below the upper-triangular blocks will be filled with +*> zeros. See Further Details. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is DOUBLE PRECISION array, dimension min(M,N). +*> The elements can be only plus or minus one. +*> +*> D(i) is constructed as D(i) = -SIGN(Q_in_i(i,i)), where +*> 1 <= i <= min(M,N), and Q_in_i is Q_in after performing +*> i-1 steps of "modified" Gaussian elimination. +*> See Further Details.
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The computed M-by-M orthogonal factor Q_out is defined implicitly as +*> a product of orthogonal matrices Q_out(i). Each Q_out(i) is stored in +*> the compact WY-representation format in the corresponding blocks of +*> matrices V (stored in A) and T. +*> +*> The M-by-N unit lower-trapezoidal matrix V stored in the M-by-N +*> matrix A contains the column vectors V(i) in NB-size column +*> blocks VB(j). For example, VB(1) contains the columns +*> V(1), V(2), ... V(NB). NOTE: The unit entries on +*> the diagonal of V are not stored in A. +*> +*> The number of column blocks is +*> +*> NOCB = Number_of_output_col_blocks = CEIL(N/NB) +*> +*> where each block is of order NB except for the last block, which +*> is of order LAST_NB = N - (NOCB-1)*NB. +*> +*> For example, if M=6, N=5 and NB=2, the matrix V is +*> +*> +*> V = ( VB(1), VB(2), VB(3) ) = +*> +*> = ( 1 ) +*> ( v21 1 ) +*> ( v31 v32 1 ) +*> ( v41 v42 v43 1 ) +*> ( v51 v52 v53 v54 1 ) +*> ( v61 v62 v63 v64 v65 ) +*> +*> +*> For each of the column blocks VB(i), an upper-triangular block +*> reflector TB(i) is computed. These blocks are stored as +*> a sequence of upper-triangular column blocks in the NB-by-N +*> matrix T. The size of each TB(i) block is NB-by-NB, except +*> for the last block, whose size is LAST_NB-by-LAST_NB. +*> +*> For example, if M=6, N=5 and NB=2, the matrix T is +*> +*> T = ( TB(1), TB(2), TB(3) ) = +*> +*> = ( t11 t12 t13 t14 t15 ) +*> ( t22 t24 ) +*> +*> +*> The M-by-M factor Q_out is given as a product of NOCB +*> orthogonal M-by-M matrices Q_out(i). +*> +*> Q_out = Q_out(1) * Q_out(2) * ... * Q_out(NOCB), +*> +*> where each matrix Q_out(i) is given by the WY-representation +*> using corresponding blocks from the matrices V and T: +*> +*> Q_out(i) = I - VB(i) * TB(i) * (VB(i))**T, +*> +*> where I is the identity matrix. Here is the formula with matrix +*> dimensions: +*> +*> Q(i){M-by-M} = I{M-by-M} - +*> VB(i){M-by-INB} * TB(i){INB-by-INB} * (VB(i))**T {INB-by-M}, +*> +*> where INB = NB, except for the last block NOCB +*> for which INB=LAST_NB. +*> +*> ===== +*> NOTE: +*> ===== +*> +*> If Q_in is the result of doing a QR factorization +*> B = Q_in * R_in, then: +*> +*> B = (Q_out*S) * R_in = Q_out * (S * R_in) = Q_out * R_out. +*> +*> So if one wants to interpret Q_out as the result +*> of the QR factorization of B, then corresponding R_out +*> should be obtained by R_out = S * R_in, i.e. some rows of R_in +*> should be multiplied by -1. +*> +*> For the details of the algorithm, see [1]. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd.
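Where DORHR_COL fits in practice: continuing the hypothetical TSQR sketch shown earlier in this patch (same A, T, R and sizes; everything here is an illustrative assumption, not patch content), the reconstruction plus the sign fix-up from the NOTE above would look roughly like this fragment:

*     Additional declaration for the reconstruction step:
      DOUBLE PRECISION D( N )
*     Reconstruct Householder vectors V (stored in A) and block
*     reflectors T such that Q_in = Q_out*S, with diag(S) in D.
      CALL DORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
*     Per the NOTE above, R_out = S * R_in: negate row I of the
*     saved R wherever D( I ) = -1, so B = Q_out * R_out holds.
      DO J = 1, N
         DO I = 1, N
            IF( D( I ).EQ.-1.0D+0 ) R( I, J ) = -R( I, J )
         END DO
      END DO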
+* +*> \date November 2019 +* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, M, N, NB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), D( * ), T( LDT, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, JBTEMP1, JBTEMP2, JNB, + $ NPLUSONE +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DLAORHR_COL_GETRFNP, DSCAL, DTRSM, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. N.GT.M ) THEN + INFO = -2 + ELSE IF( NB.LT.1 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -5 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -7 + END IF +* +* Handle error in the input parameters. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DORHR_COL', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + RETURN + END IF +* +* On input, the M-by-N matrix A contains the orthogonal +* M-by-N matrix Q_in. +* +* (1) Compute the unit lower-trapezoidal V (ones on the diagonal +* are not stored) by performing the "modified" LU-decomposition. +* +* Q_in - ( S ) = V * U = ( V1 ) * U, +* ( 0 ) ( V2 ) +* +* where 0 is an (M-N)-by-N zero matrix. +* +* (1-1) Factor V1 and U. + + CALL DLAORHR_COL_GETRFNP( N, N, A, LDA, D, IINFO ) +* +* (1-2) Solve for V2. +* + IF( M.GT.N ) THEN + CALL DTRSM( 'R', 'U', 'N', 'N', M-N, N, ONE, A, LDA, + $ A( N+1, 1 ), LDA ) + END IF +* +* (2) Reconstruct the block reflector T stored in T(1:NB, 1:N) +* as a sequence of upper-triangular blocks with NB-size column +* blocking. +* +* Loop over the column blocks of size NB of the array A(1:M,1:N) +* and the array T(1:NB,1:N), JB is the column index of a column +* block, JNB is the column block size at each step JB. +* + NPLUSONE = N + 1 + DO JB = 1, N, NB +* +* (2-0) Determine the column block size JNB. +* + JNB = MIN( NPLUSONE-JB, NB ) +* +* (2-1) Copy the upper-triangular part of the current JNB-by-JNB +* diagonal block U(JB) (of the N-by-N matrix U) stored +* in A(JB:JB+JNB-1,JB:JB+JNB-1) into the upper-triangular part +* of the current JNB-by-JNB block T(1:JNB,JB:JB+JNB-1) +* column-by-column, total JNB*(JNB+1)/2 elements. +* + JBTEMP1 = JB - 1 + DO J = JB, JB+JNB-1 + CALL DCOPY( J-JBTEMP1, A( JB, J ), 1, T( 1, J ), 1 ) + END DO +* +* (2-2) Perform on the upper-triangular part of the current +* JNB-by-JNB diagonal block U(JB) (of the N-by-N matrix U) stored +* in T(1:JNB,JB:JB+JNB-1) the following operation in place: +* (-1)*U(JB)*S(JB), i.e the result will be stored in the upper- +* triangular part of T(1:JNB,JB:JB+JNB-1). 
This multiplication +* of the JNB-by-JNB diagonal block U(JB) by the JNB-by-JNB +* diagonal block S(JB) of the N-by-N sign matrix S from the +* right means changing the sign of each J-th column of the block +* U(JB) according to the sign of the diagonal element of the block +* S(JB), i.e. S(J,J) that is stored in the array element D(J). +* + DO J = JB, JB+JNB-1 + IF( D( J ).EQ.ONE ) THEN + CALL DSCAL( J-JBTEMP1, -ONE, T( 1, J ), 1 ) + END IF + END DO +* +* (2-3) Perform the triangular solve for the current block +* matrix X(JB): +* +* X(JB) * (A(JB)**T) = B(JB), where: +* +* A(JB)**T is a JNB-by-JNB unit upper-triangular +* coefficient block, and A(JB)=V1(JB), which +* is a JNB-by-JNB unit lower-triangular block +* stored in A(JB:JB+JNB-1,JB:JB+JNB-1). +* The N-by-N matrix V1 is the upper part +* of the M-by-N lower-trapezoidal matrix V +* stored in A(1:M,1:N); +* +* B(JB) is a JNB-by-JNB upper-triangular right-hand +* side block, B(JB) = (-1)*U(JB)*S(JB), and +* B(JB) is stored in T(1:JNB,JB:JB+JNB-1); +* +* X(JB) is a JNB-by-JNB upper-triangular solution +* block, X(JB) is the upper-triangular block +* reflector T(JB), and X(JB) is stored +* in T(1:JNB,JB:JB+JNB-1). +* +* In other words, we perform the triangular solve for the +* upper-triangular block T(JB): +* +* T(JB) * (V1(JB)**T) = (-1)*U(JB)*S(JB). +* +* Even though the blocks X(JB) and B(JB) are upper- +* triangular, the routine DTRSM will access all JNB**2 +* elements of the square T(1:JNB,JB:JB+JNB-1). Therefore, +* we need to set to zero the elements of the block +* T(1:JNB,JB:JB+JNB-1) below the diagonal before the call +* to DTRSM. +* +* (2-3a) Set the elements to zero. +* + JBTEMP2 = JB - 2 + DO J = JB, JB+JNB-2 + DO I = J-JBTEMP2, NB + T( I, J ) = ZERO + END DO + END DO +* +* (2-3b) Perform the triangular solve. +* + CALL DTRSM( 'R', 'L', 'T', 'U', JNB, JNB, ONE, + $ A( JB, JB ), LDA, T( 1, JB ), LDT ) +* + END DO +* + RETURN +* +* End of DORHR_COL +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/dporfsx.f b/lapack-netlib/SRC/dporfsx.f index 53724925e..67cca9ccf 100644 --- a/lapack-netlib/SRC/dporfsx.f +++ b/lapack-netlib/SRC/dporfsx.f @@ -135,7 +135,7 @@ *> \param[in,out] S *> \verbatim *> S is DOUBLE PRECISION array, dimension (N) -*> The row scale factors for A. If EQUED = 'Y', A is multiplied on +*> The scale factors for A. If EQUED = 'Y', A is multiplied on *> the left and right by diag(S). S is an input argument if FACT = *> 'F'; otherwise, S is an output argument. If FACT = 'F' and EQUED *> = 'Y', each element of S must be positive. If S is output, each @@ -263,7 +263,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -299,14 +299,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension (NPARAMS) -*> Specifies algorithm parameters. If an entry is .LT. 
0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -314,9 +314,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/dposvxx.f b/lapack-netlib/SRC/dposvxx.f index 488e0b15a..b0de44910 100644 --- a/lapack-netlib/SRC/dposvxx.f +++ b/lapack-netlib/SRC/dposvxx.f @@ -366,7 +366,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -402,14 +402,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -417,9 +417,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. +*> = 1.0: Use the extra-precise refinement algorithm. 
*> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/dsb2st_kernels.f b/lapack-netlib/SRC/dsb2st_kernels.f index 3bf126d5b..a9dc6b5ca 100644 --- a/lapack-netlib/SRC/dsb2st_kernels.f +++ b/lapack-netlib/SRC/dsb2st_kernels.f @@ -1,26 +1,26 @@ *> \brief \b DSB2ST_KERNELS * * @generated from zhb2st_kernels.f, fortran z -> d, Wed Dec 7 08:22:39 2016 -* +* * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download DSB2ST_KERNELS + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download DSB2ST_KERNELS + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, +* SUBROUTINE DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, * ST, ED, SWEEP, N, NB, IB, * A, LDA, V, TAU, LDVT, WORK) * @@ -32,9 +32,9 @@ * INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT * .. * .. Array Arguments .. -* DOUBLE PRECISION A( LDA, * ), V( * ), +* DOUBLE PRECISION A( LDA, * ), V( * ), * TAU( * ), WORK( * ) -* +* *> \par Purpose: * ============= *> @@ -124,7 +124,7 @@ *> LDVT is INTEGER. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array. Workspace of size nb. *> \endverbatim @@ -147,7 +147,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -155,16 +155,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, + SUBROUTINE DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, $ ST, ED, SWEEP, N, NB, IB, $ A, LDA, V, TAU, LDVT, WORK) * @@ -181,7 +181,7 @@ INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), V( * ), + DOUBLE PRECISION A( LDA, * ), V( * ), $ TAU( * ), WORK( * ) * .. * @@ -195,8 +195,8 @@ * .. Local Scalars .. LOGICAL UPPER INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS, - $ DPOS, OFDPOS, AJETER - DOUBLE PRECISION CTMP + $ DPOS, OFDPOS, AJETER + DOUBLE PRECISION CTMP * .. * .. External Subroutines .. EXTERNAL DLARFG, DLARFX, DLARFY @@ -209,7 +209,7 @@ * .. * .. * .. Executable Statements .. 
-* +* AJETER = IB + LDVT UPPER = LSAME( UPLO, 'U' ) @@ -240,10 +240,10 @@ V( VPOS ) = ONE DO 10 I = 1, LM-1 V( VPOS+I ) = ( A( OFDPOS-I, ST+I ) ) - A( OFDPOS-I, ST+I ) = ZERO + A( OFDPOS-I, ST+I ) = ZERO 10 CONTINUE CTMP = ( A( OFDPOS, ST ) ) - CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, + CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) A( OFDPOS, ST ) = CTMP * @@ -281,14 +281,14 @@ * V( VPOS ) = ONE DO 30 I = 1, LM-1 - V( VPOS+I ) = + V( VPOS+I ) = $ ( A( DPOS-NB-I, J1+I ) ) A( DPOS-NB-I, J1+I ) = ZERO 30 CONTINUE CTMP = ( A( DPOS-NB, J1 ) ) CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) ) A( DPOS-NB, J1 ) = CTMP -* +* CALL DLARFX( 'Right', LN-1, LM, V( VPOS ), $ TAU( TAUPOS ), $ A( DPOS-NB+1, J1 ), LDA-1, WORK) @@ -296,9 +296,9 @@ ENDIF * * Lower case -* +* ELSE -* +* IF( WANTZ ) THEN VPOS = MOD( SWEEP-1, 2 ) * N + ST TAUPOS = MOD( SWEEP-1, 2 ) * N + ST @@ -313,9 +313,9 @@ V( VPOS ) = ONE DO 20 I = 1, LM-1 V( VPOS+I ) = A( OFDPOS+I, ST-1 ) - A( OFDPOS+I, ST-1 ) = ZERO + A( OFDPOS+I, ST-1 ) = ZERO 20 CONTINUE - CALL DLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, + CALL DLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) * LM = ED - ST + 1 @@ -342,7 +342,7 @@ LM = J2-J1+1 * IF( LM.GT.0) THEN - CALL DLARFX( 'Right', LM, LN, V( VPOS ), + CALL DLARFX( 'Right', LM, LN, V( VPOS ), $ TAU( TAUPOS ), A( DPOS+NB, ST ), $ LDA-1, WORK) * @@ -359,13 +359,13 @@ V( VPOS+I ) = A( DPOS+NB+I, ST ) A( DPOS+NB+I, ST ) = ZERO 40 CONTINUE - CALL DLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, + CALL DLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) * - CALL DLARFX( 'Left', LM, LN-1, V( VPOS ), + CALL DLARFX( 'Left', LM, LN-1, V( VPOS ), $ ( TAU( TAUPOS ) ), $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK) - + ENDIF ENDIF ENDIF @@ -374,4 +374,4 @@ * * END OF DSB2ST_KERNELS * - END + END diff --git a/lapack-netlib/SRC/dsbgvx.f b/lapack-netlib/SRC/dsbgvx.f index eab5ebcbb..6de1eb89b 100644 --- a/lapack-netlib/SRC/dsbgvx.f +++ b/lapack-netlib/SRC/dsbgvx.f @@ -261,11 +261,11 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit -*> < 0 : if INFO = -i, the i-th argument had an illegal value +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value *> <= N: if INFO = i, then i eigenvectors failed to converge. *> Their indices are stored in IFAIL. -*> > N : DPBSTF returned an error code; i.e., +*> > N: DPBSTF returned an error code; i.e., *> if INFO = N + i, for 1 <= i <= N, then the leading *> minor of order i of B is not positive definite. *> The factorization of B could not be completed and diff --git a/lapack-netlib/SRC/dsgesv.f b/lapack-netlib/SRC/dsgesv.f index f47327d00..edbb87e7a 100644 --- a/lapack-netlib/SRC/dsgesv.f +++ b/lapack-netlib/SRC/dsgesv.f @@ -92,9 +92,9 @@ *> dimension (LDA,N) *> On entry, the N-by-N coefficient matrix A. *> On exit, if iterative refinement has been successfully used -*> (INFO.EQ.0 and ITER.GE.0, see description below), then A is +*> (INFO = 0 and ITER >= 0, see description below), then A is *> unchanged, if double precision factorization has been used -*> (INFO.EQ.0 and ITER.LT.0, see description below), then the +*> (INFO = 0 and ITER < 0, see description below), then the *> array A contains the factors L and U from the factorization *> A = P*L*U; the unit diagonal elements of L are not stored. *> \endverbatim @@ -111,8 +111,8 @@ *> The pivot indices that define the permutation matrix P; *> row i of the matrix was interchanged with row IPIV(i). 
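The ITER semantics documented in the dsgesv.f hunks here (beginning just above and continuing below) are easiest to see in a driver. A minimal sketch, not part of the patch; the full DSGESV argument list (N, NRHS, A, LDA, IPIV, B, LDB, X, LDX, WORK, SWORK, ITER, INFO) and the REAL/DOUBLE PRECISION workspace split are assumptions, since these hunks quote only fragments of the routine:

      PROGRAM REFDEM
      IMPLICIT NONE
      INTEGER N, NRHS, LDA, LDB, LDX
      PARAMETER ( N = 3, NRHS = 1, LDA = N, LDB = N, LDX = N )
      INTEGER IPIV( N ), ITER, INFO
      DOUBLE PRECISION A( LDA, N ), B( LDB, NRHS ), X( LDX, NRHS )
      DOUBLE PRECISION WORK( N, NRHS )
      REAL SWORK( N*( N+NRHS ) )
      EXTERNAL DSGESV
*     A small, diagonally dominant system with solution X = 1.
      DATA A / 4.0D0, 1.0D0, 0.0D0,
     $         1.0D0, 4.0D0, 1.0D0,
     $         0.0D0, 1.0D0, 4.0D0 /
      DATA B / 5.0D0, 6.0D0, 5.0D0 /
      CALL DSGESV( N, NRHS, A, LDA, IPIV, B, LDB, X, LDX, WORK,
     $             SWORK, ITER, INFO )
*     ITER >= 0: the single-precision factorization plus iterative
*     refinement converged and A is unchanged on exit; ITER < 0:
*     the code fell back to a double-precision factorization and
*     A holds the L and U factors, as documented above.
      WRITE( *, * ) 'INFO =', INFO, ', ITER =', ITER
      END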
*> Corresponds either to the single precision factorization -*> (if INFO.EQ.0 and ITER.GE.0) or the double precision -*> factorization (if INFO.EQ.0 and ITER.LT.0). +*> (if INFO = 0 and ITER >= 0) or the double precision +*> factorization (if INFO = 0 and ITER < 0). *> \endverbatim *> *> \param[in] B @@ -406,7 +406,7 @@ 30 CONTINUE * * If we are at this place of the code, this is because we have -* performed ITER=ITERMAX iterations and never satisified the +* performed ITER=ITERMAX iterations and never satisfied the * stopping criterion, set up the ITER flag accordingly and follow up * on double precision routine. * diff --git a/lapack-netlib/SRC/dsposv.f b/lapack-netlib/SRC/dsposv.f index 4a8575241..6c8baa56b 100644 --- a/lapack-netlib/SRC/dsposv.f +++ b/lapack-netlib/SRC/dsposv.f @@ -106,9 +106,9 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> On exit, if iterative refinement has been successfully used -*> (INFO.EQ.0 and ITER.GE.0, see description below), then A is +*> (INFO = 0 and ITER >= 0, see description below), then A is *> unchanged, if double precision factorization has been used -*> (INFO.EQ.0 and ITER.LT.0, see description below), then the +*> (INFO = 0 and ITER < 0, see description below), then the *> array A contains the factor U or L from the Cholesky *> factorization A = U**T*U or A = L*L**T. *> \endverbatim @@ -413,7 +413,7 @@ 30 CONTINUE * * If we are at this place of the code, this is because we have -* performed ITER=ITERMAX iterations and never satisified the +* performed ITER=ITERMAX iterations and never satisfied the * stopping criterion, set up the ITER flag accordingly and follow * up on double precision routine. * diff --git a/lapack-netlib/SRC/dstemr.f b/lapack-netlib/SRC/dstemr.f index a1a8e3433..16c9d970d 100644 --- a/lapack-netlib/SRC/dstemr.f +++ b/lapack-netlib/SRC/dstemr.f @@ -233,13 +233,13 @@ *> \param[in,out] TRYRAC *> \verbatim *> TRYRAC is LOGICAL -*> If TRYRAC.EQ..TRUE., indicates that the code should check whether +*> If TRYRAC = .TRUE., indicates that the code should check whether *> the tridiagonal matrix defines its eigenvalues to high relative *> accuracy. If so, the code uses relative-accuracy preserving *> algorithms that might be (a bit) slower depending on the matrix. *> If the matrix does not define its eigenvalues to high relative *> accuracy, the code can uses possibly faster algorithms. -*> If TRYRAC.EQ..FALSE., the code is not required to guarantee +*> If TRYRAC = .FALSE., the code is not required to guarantee *> relatively accurate eigenvalues and can use the fastest possible *> techniques. *> On exit, a .TRUE. TRYRAC will be set to .FALSE. 
if the matrix diff --git a/lapack-netlib/SRC/dsyconvf.f b/lapack-netlib/SRC/dsyconvf.f index 37c8157ba..60cfd1e65 100644 --- a/lapack-netlib/SRC/dsyconvf.f +++ b/lapack-netlib/SRC/dsyconvf.f @@ -291,7 +291,7 @@ * * Convert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in factorization order where i decreases from N to 1 * I = N @@ -344,7 +344,7 @@ * * Revert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in reverse factorization order where i increases from 1 to N * I = 1 @@ -435,7 +435,7 @@ * * Convert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in factorization order where k increases from 1 to N * I = 1 @@ -488,7 +488,7 @@ * * Revert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in reverse factorization order where i decreases from N to 1 * I = N diff --git a/lapack-netlib/SRC/dsyconvf_rook.f b/lapack-netlib/SRC/dsyconvf_rook.f index 5c774906e..bd683a087 100644 --- a/lapack-netlib/SRC/dsyconvf_rook.f +++ b/lapack-netlib/SRC/dsyconvf_rook.f @@ -282,7 +282,7 @@ * * Convert PERMUTATIONS * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in factorization order where i decreases from N to 1 * I = N @@ -333,7 +333,7 @@ * * Revert PERMUTATIONS * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in reverse factorization order where i increases from 1 to N * I = 1 @@ -423,7 +423,7 @@ * * Convert PERMUTATIONS * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in factorization order where i increases from 1 to N * I = 1 @@ -474,7 +474,7 @@ * * Revert PERMUTATIONS * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in reverse factorization order where i decreases from N to 1 * I = N diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f index fff0dedbc..9d802905c 100644 --- a/lapack-netlib/SRC/dsyev_2stage.f +++ b/lapack-netlib/SRC/dsyev_2stage.f @@ -317,7 +317,7 @@ IF( .NOT.WANTZ ) THEN CALL DSTERF( N, W, WORK( INDE ), INFO ) ELSE -* Not available in this release, and agrument checking should not +* Not available in this release, and argument checking should not * let it getting here RETURN CALL DORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ), diff --git a/lapack-netlib/SRC/dsyevd_2stage.f b/lapack-netlib/SRC/dsyevd_2stage.f index 75a6da436..ff8e08d71 100644 --- a/lapack-netlib/SRC/dsyevd_2stage.f +++ b/lapack-netlib/SRC/dsyevd_2stage.f @@ -385,7 +385,7 @@ IF( .NOT.WANTZ ) THEN CALL DSTERF( N, W, WORK( INDE ), INFO ) ELSE -* Not available in this release, and agrument checking should not +* Not available in this release, and argument checking should not * let it getting here RETURN CALL DSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N, diff --git a/lapack-netlib/SRC/dsyrfsx.f b/lapack-netlib/SRC/dsyrfsx.f index e128cd4e0..eb091e720 100644 --- a/lapack-netlib/SRC/dsyrfsx.f +++ b/lapack-netlib/SRC/dsyrfsx.f @@ -271,7 +271,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. 
If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -307,14 +307,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension (NPARAMS) -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -322,9 +322,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index 7192928c6..4ee474448 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -42,7 +42,7 @@ *> matrices. *> *> Aasen's algorithm is used to factor A as -*> A = U * T * U**T, if UPLO = 'U', or +*> A = U**T * T * U, if UPLO = 'U', or *> A = L * T * L**T, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is symmetric tridiagonal. The factored @@ -86,7 +86,7 @@ *> *> On exit, if INFO = 0, the tridiagonal matrix T and the *> multipliers used to obtain the factor U or L from the -*> factorization A = U*T*U**T or A = L*T*L**T as computed by +*> factorization A = U**T*T*U or A = L*T*L**T as computed by *> DSYTRF. *> \endverbatim *> @@ -230,7 +230,7 @@ RETURN END IF * -* Compute the factorization A = U*T*U**T or A = L*T*L**T. +* Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) IF( INFO.EQ.0 ) THEN diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 05e538f0b..ef593bc7e 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -45,7 +45,7 @@ *> matrices. *> *> Aasen's 2-stage algorithm is used to factor A as -*> A = U * T * U**T, if UPLO = 'U', or +*> A = U**T * T * U, if UPLO = 'U', or *> A = L * T * L**T, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is symmetric and band. The matrix T is @@ -259,7 +259,7 @@ END IF * * -* Compute the factorization A = U*T*U**T or A = L*T*L**T. +* Compute the factorization A = U**T*T*U or A = L*T*L**T. 
* CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, $ WORK, LWORK, INFO ) diff --git a/lapack-netlib/SRC/dsysvxx.f b/lapack-netlib/SRC/dsysvxx.f index 6e167d81e..0be50bcd1 100644 --- a/lapack-netlib/SRC/dsysvxx.f +++ b/lapack-netlib/SRC/dsysvxx.f @@ -377,7 +377,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -413,14 +413,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension (NPARAMS) -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -428,9 +428,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. +*> = 1.0: Use the extra-precise refinement algorithm. *> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/dsytf2_rk.f b/lapack-netlib/SRC/dsytf2_rk.f index 45cf62ab9..cc9e4616e 100644 --- a/lapack-netlib/SRC/dsytf2_rk.f +++ b/lapack-netlib/SRC/dsytf2_rk.f @@ -312,7 +312,7 @@ * * Factorize A as U*D*U**T using the upper triangle of A * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = ZERO @@ -623,7 +623,7 @@ * * Factorize A as L*D*L**T using the lower triangle of A * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = ZERO * diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f index 522602bb2..fc4b92908 100644 --- a/lapack-netlib/SRC/dsytrd_2stage.f +++ b/lapack-netlib/SRC/dsytrd_2stage.f @@ -123,23 +123,22 @@ *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is DOUBLE PRECISION array, dimension LHOUS2, that -*> store the Householder representation of the stage2 +*> HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2) +*> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim *> *> \param[in] LHOUS2 *> \verbatim *> LHOUS2 is INTEGER -*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array HOUS2. +*> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns *> this value as the first entry of the HOUS2 array, and no error *> message related to LHOUS2 is issued by XERBLA. 
-*> LHOUS2 = MAX(1, dimension) where -*> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> If VECT='N', LHOUS2 = max(1, 4*n); +*> if VECT='V', option not yet available. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index 4d81fe226..0c0dbf125 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -50,9 +50,9 @@ * Arguments: * ========== * -*> \param[in] STAGE +*> \param[in] STAGE1 *> \verbatim -*> STAGE is CHARACTER*1 +*> STAGE1 is CHARACTER*1 *> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the dsytrd_sy2sb routine *> was not called before this routine to reproduce AB. diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index e0a5debc5..7f30817b0 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -363,7 +363,7 @@ * * * Set the workspace of the triangular matrix T to zero once such a -* way everytime T is generated the upper/lower portion will be always zero +* way every time T is generated the upper/lower portion will be always zero * CALL DLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT ) * diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index d8da4f122..a3bd30a2f 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -39,7 +39,7 @@ *> the Bunch-Kaufman diagonal pivoting method. The form of the *> factorization is *> -*> A = U*D*U**T or A = L*D*L**T +*> A = U**T*D*U or A = L*D*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and D is symmetric and block diagonal with @@ -144,7 +144,7 @@ *> *> \verbatim *> -*> If UPLO = 'U', then A = U*D*U**T, where +*> If UPLO = 'U', then A = U**T*D*U, where *> U = P(n)*U(n)* ... *P(k)U(k)* ..., *> i.e., U is a product of terms P(k)*U(k), where k decreases from n to *> 1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 @@ -262,7 +262,7 @@ * IF( UPPER ) THEN * -* Factorize A as U*D*U**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * * K is the main loop index, decreasing from N to 1 in steps of * KB, where KB is the number of columns factorized by DLASYF; diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 24b3f393b..6df0da2cd 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -37,7 +37,7 @@ *> DSYTRF_AA computes the factorization of a real symmetric matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a symmetric tridiagonal matrix. @@ -223,7 +223,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... 
* * Copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N)) @@ -256,7 +256,7 @@ $ A( MAX(1, J), J+1 ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J @@ -375,7 +375,7 @@ $ A( J+1, MAX(1, J) ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index 25fc1a2eb..a37be5bdd 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -38,7 +38,7 @@ *> DSYTRF_AA_2STAGE computes the factorization of a real symmetric matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a symmetric band matrix with the @@ -103,6 +103,22 @@ *> no error message related to LTB is issued by XERBLA. *> \endverbatim *> +*> \param[out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> On exit, it contains the details of the interchanges, i.e., +*> the row and column k of A were interchanged with the +*> row and column IPIV(k). +*> \endverbatim +*> +*> \param[out] IPIV2 +*> \verbatim +*> IPIV2 is INTEGER array, dimension (N) +*> On exit, it contains the details of the interchanges, i.e., +*> the row and column k of T were interchanged with the +*> row and column IPIV2(k). +*> \endverbatim +*> *> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION workspace of size LWORK @@ -120,22 +136,6 @@ *> no error message related to LWORK is issued by XERBLA. *> \endverbatim *> -*> \param[out] IPIV -*> \verbatim -*> IPIV is INTEGER array, dimension (N) -*> On exit, it contains the details of the interchanges, i.e., -*> the row and column k of A were interchanged with the -*> row and column IPIV(k). -*> \endverbatim -*> -*> \param[out] IPIV2 -*> \verbatim -*> IPIV2 is INTEGER array, dimension (N) -*> On exit, it contains the details of the interchanges, i.e., -*> the row and column k of T were interchanged with the -*> row and column IPIV2(k). -*> \endverbatim -*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -275,7 +275,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... 
* DO J = 0, NT-1 @@ -442,12 +442,14 @@ c END IF * > Apply pivots to previous columns of L CALL DSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, $ A( (J+1)*NB+1, I2 ), 1 ) -* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL DSWAP( I2-I1-1, A( I1, I1+1 ), LDA, - $ A( I1+1, I2 ), 1 ) +* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) + IF( I2.GT.(I1+1) ) + $ CALL DSWAP( I2-I1-1, A( I1, I1+1 ), LDA, + $ A( I1+1, I2 ), 1 ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL DSWAP( N-I2, A( I1, I2+1 ), LDA, - $ A( I2, I2+1 ), LDA ) + IF( I2.LT.N ) + $ CALL DSWAP( N-I2, A( I1, I2+1 ), LDA, + $ A( I2, I2+1 ), LDA ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) @@ -616,11 +618,13 @@ c END IF CALL DSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, $ A( I2, (J+1)*NB+1 ), LDA ) * > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL DSWAP( I2-I1-1, A( I1+1, I1 ), 1, - $ A( I2, I1+1 ), LDA ) + IF( I2.GT.(I1+1) ) + $ CALL DSWAP( I2-I1-1, A( I1+1, I1 ), 1, + $ A( I2, I1+1 ), LDA ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL DSWAP( N-I2, A( I2+1, I1 ), 1, - $ A( I2+1, I2 ), 1 ) + IF( I2.LT.N ) + $ CALL DSWAP( N-I2, A( I2+1, I1 ), 1, + $ A( I2+1, I2 ), 1 ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index 23f8b9fa2..5c3a5ec76 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -62,7 +62,7 @@ *> \param[in,out] A *> \verbatim *> A is DOUBLE PRECISION array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers +*> On entry, the block diagonal matrix D and the multipliers *> used to obtain the factor U or L as computed by DSYTRF. *> *> On exit, if INFO = 0, the (symmetric) inverse of the original @@ -82,7 +82,7 @@ *> \param[in] IPIV *> \verbatim *> IPIV is INTEGER array, dimension (N) -*> Details of the interchanges and the NB structure of D +*> Details of the interchanges and the block structure of D *> as determined by DSYTRF. *> \endverbatim *> diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f index 05ef31ff3..d9dc0a6d1 100644 --- a/lapack-netlib/SRC/dsytrs_aa.f +++ b/lapack-netlib/SRC/dsytrs_aa.f @@ -37,7 +37,7 @@ *> \verbatim *> *> DSYTRS_AA solves a system of linear equations A*X = B with a real -*> symmetric matrix A using the factorization A = U*T*U**T or +*> symmetric matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by DSYTRF_AA. *> \endverbatim * @@ -49,7 +49,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -97,14 +97,16 @@ *> The leading dimension of the array B. LDB >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim -*> WORK is DOUBLE array, dimension (MAX(1,LWORK)) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER, LWORK >= MAX(1,3*N-2). +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -198,22 +200,29 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. 
+* +* 1) Forward substitution with U**T +* + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B * -* Pivot, P**T * B + DO K = 1, N + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO * - DO K = 1, N - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO +* Compute U**T \ B -> B [ (U**T \P**T * B) ] * -* Compute (U \P**T * B) -> B [ (U \P**T * B) ] + CALL DTRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB) + END IF * - CALL DTRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) +* 2) Solve with triangular matrix T * -* Compute T \ B -> B [ T \ (U \P**T * B) ] +* Compute T \ B -> B [ T \ (U**T \P**T * B) ] * CALL DLACPY( 'F', 1, N, A( 1, 1 ), LDA+1, WORK( N ), 1) IF( N.GT.1 ) THEN @@ -223,35 +232,47 @@ CALL DGTSV( N, NRHS, WORK( 1 ), WORK( N ), WORK( 2*N ), B, LDB, $ INFO ) * -* Compute (U**T \ B) -> B [ U**T \ (T \ (U \P**T * B) ) ] +* 3) Backward substitution with U +* + IF( N.GT.1 ) THEN * - CALL DTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) +* Compute U \ B -> B [ U \ (T \ (U**T \P**T * B) ) ] * -* Pivot, P * B [ P * (U**T \ (T \ (U \P**T * B) )) ] + CALL DTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB) * - DO K = N, 1, -1 - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO +* Pivot, P * B -> B [ P * (U \ (T \ (U**T \P**T * B) )) ] +* + DO K = N, 1, -1 + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO + END IF * ELSE * * Solve A*X = B, where A = L*T*L**T. * -* Pivot, P**T * B +* 1) Forward substitution with L * - DO K = 1, N - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B +* + DO K = 1, N + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute L \ B -> B [ (L \P**T * B) ] +* + CALL DTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB) + END IF * - CALL DTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) +* 2) Solve with triangular matrix T * * Compute T \ B -> B [ T \ (L \P**T * B) ] * @@ -263,18 +284,23 @@ CALL DGTSV( N, NRHS, WORK( 1 ), WORK(N), WORK( 2*N ), B, LDB, $ INFO) * -* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] +* 3) Backward substitution with L**T * - CALL DTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) + IF( N.GT.1 ) THEN +* +* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] + CALL DTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB) * - DO K = N, 1, -1 - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO +* Pivot, P * B -> B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* + DO K = N, 1, -1 + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO + END IF * END IF * diff --git a/lapack-netlib/SRC/dsytrs_aa_2stage.f b/lapack-netlib/SRC/dsytrs_aa_2stage.f index bb283cb95..69c702f8a 100644 --- a/lapack-netlib/SRC/dsytrs_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrs_aa_2stage.f @@ -36,7 +36,7 @@ *> \verbatim *> *> DSYTRS_AA_2STAGE solves a system of linear equations A*X = B with a real -*> symmetric matrix A using the factorization A = 
U*T*U**T or +*> symmetric matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by DSYTRF_AA_2STAGE. *> \endverbatim * @@ -48,7 +48,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -208,15 +208,15 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL DLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (U**T \P**T * B) -> B [ (U**T \P**T * B) ] +* Compute (U**T \ B) -> B [ (U**T \P**T * B) ] * CALL DTRSM( 'L', 'U', 'T', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) @@ -234,7 +234,7 @@ CALL DTRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (U \ (T \ (U**T \P**T * B) )) ] +* Pivot, P * B -> B [ P * (U \ (T \ (U**T \P**T * B) )) ] * CALL DLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * @@ -246,11 +246,11 @@ * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL DLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute (L \ B) -> B [ (L \P**T * B) ] * CALL DTRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) @@ -268,7 +268,7 @@ CALL DTRSM( 'L', 'L', 'T', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* Pivot, P * B -> B [ P * (L**T \ (T \ (L \P**T * B) )) ] * CALL DLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * diff --git a/lapack-netlib/SRC/dtgsy2.f b/lapack-netlib/SRC/dtgsy2.f index 1c687b15e..e8c9b4001 100644 --- a/lapack-netlib/SRC/dtgsy2.f +++ b/lapack-netlib/SRC/dtgsy2.f @@ -71,7 +71,7 @@ *> R * B**T + L * E**T = scale * -F *> *> This case is used to compute an estimate of Dif[(A, D), (B, E)] = -*> sigma_min(Z) using reverse communicaton with DLACON. +*> sigma_min(Z) using reverse communication with DLACON. *> *> DTGSY2 also (IJOB >= 1) contributes to the computation in DTGSYL *> of an upper bound on the separation between to matrix pairs. Then @@ -85,7 +85,7 @@ *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 -*> = 'N', solve the generalized Sylvester equation (1). +*> = 'N': solve the generalized Sylvester equation (1). *> = 'T': solve the 'transposed' system (3). *> \endverbatim *> diff --git a/lapack-netlib/SRC/dtgsyl.f b/lapack-netlib/SRC/dtgsyl.f index 1cc3a1bf8..bb0751794 100644 --- a/lapack-netlib/SRC/dtgsyl.f +++ b/lapack-netlib/SRC/dtgsyl.f @@ -88,20 +88,20 @@ *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 -*> = 'N', solve the generalized Sylvester equation (1). -*> = 'T', solve the 'transposed' system (3). +*> = 'N': solve the generalized Sylvester equation (1). +*> = 'T': solve the 'transposed' system (3). *> \endverbatim *> *> \param[in] IJOB *> \verbatim *> IJOB is INTEGER *> Specifies what kind of functionality to be performed. -*> =0: solve (1) only. -*> =1: The functionality of 0 and 3. -*> =2: The functionality of 0 and 4. -*> =3: Only an estimate of Dif[(A,D), (B,E)] is computed. +*> = 0: solve (1) only. +*> = 1: The functionality of 0 and 3. +*> = 2: The functionality of 0 and 4. +*> = 3: Only an estimate of Dif[(A,D), (B,E)] is computed. *> (look ahead strategy IJOB = 1 is used). -*> =4: Only an estimate of Dif[(A,D), (B,E)] is computed. 
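[Editor's sketch] DSYTRS_AA_2STAGE above brackets its triangular solves with DLASWP sweeps: INCX = 1 applies the recorded row interchanges forward (P**T * B), INCX = -1 plays them backward (P * B). A self-contained sketch of that pivot convention with hypothetical data; the two sweeps are exact inverses.

      PROGRAM PIVDEMO
*     Sketch only: DLASWP applies the interchanges IPIV(K1:K2) to
*     the rows of B, forward for INCX = 1 and backward for INCX = -1.
*     Data are hypothetical.
      INTEGER            N, NRHS, LDB
      PARAMETER          ( N = 4, NRHS = 1, LDB = N )
      INTEGER            IPIV( N )
      DOUBLE PRECISION   B( LDB, NRHS )
      EXTERNAL           DLASWP
      DATA B    / 1.0D0, 2.0D0, 3.0D0, 4.0D0 /
      DATA IPIV / 1, 4, 3, 4 /
      CALL DLASWP( NRHS, B, LDB, 1, N, IPIV,  1 )
      CALL DLASWP( NRHS, B, LDB, 1, N, IPIV, -1 )
*     B is back to (1,2,3,4)**T: the sweeps undo each other.
      PRINT *, B
      END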
+*> = 4: Only an estimate of Dif[(A,D), (B,E)] is computed. *> ( DGECON on sub-systems is used ). *> Not referenced if TRANS = 'T'. *> \endverbatim diff --git a/lapack-netlib/SRC/dtpmlqt.f b/lapack-netlib/SRC/dtpmlqt.f index 3782d0c71..975ebdc27 100644 --- a/lapack-netlib/SRC/dtpmlqt.f +++ b/lapack-netlib/SRC/dtpmlqt.f @@ -94,7 +94,7 @@ *> *> \param[in] V *> \verbatim -*> V is DOUBLE PRECISION array, dimension (LDA,K) +*> V is DOUBLE PRECISION array, dimension (LDV,K) *> The i-th row must contain the vector which defines the *> elementary reflector H(i), for i = 1,2,...,k, as returned by *> DTPLQT in B. See Further Details. diff --git a/lapack-netlib/SRC/dtpmqrt.f b/lapack-netlib/SRC/dtpmqrt.f index 44985a80d..a7888e192 100644 --- a/lapack-netlib/SRC/dtpmqrt.f +++ b/lapack-netlib/SRC/dtpmqrt.f @@ -94,7 +94,7 @@ *> *> \param[in] V *> \verbatim -*> V is DOUBLE PRECISION array, dimension (LDA,K) +*> V is DOUBLE PRECISION array, dimension (LDV,K) *> The i-th column must contain the vector which defines the *> elementary reflector H(i), for i = 1,2,...,k, as returned by *> CTPQRT in B. See Further Details. diff --git a/lapack-netlib/SRC/dtprfb.f b/lapack-netlib/SRC/dtprfb.f index 6ae8fad8c..6d3b4dae1 100644 --- a/lapack-netlib/SRC/dtprfb.f +++ b/lapack-netlib/SRC/dtprfb.f @@ -152,8 +152,8 @@ *> \verbatim *> LDA is INTEGER *> The leading dimension of the array A. -*> If SIDE = 'L', LDC >= max(1,K); -*> If SIDE = 'R', LDC >= max(1,M). +*> If SIDE = 'L', LDA >= max(1,K); +*> If SIDE = 'R', LDA >= max(1,M). *> \endverbatim *> *> \param[in,out] B diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index a438ada38..7f68c383d 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -132,7 +132,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2017 +*> \date November 2019 * *> \ingroup OTHERauxiliary * @@ -162,10 +162,10 @@ * ===================================================================== INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) * -* -- LAPACK auxiliary routine (version 3.8.0) -- +* -- LAPACK auxiliary routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2017 +* November 2019 * * .. Scalar Arguments .. CHARACTER*( * ) NAME, OPTS @@ -271,7 +271,16 @@ * NB = 1 * - IF( C2.EQ.'GE' ) THEN + IF( SUBNAM(2:6).EQ.'LAORH' ) THEN +* +* This is for *LAORHR_GETRFNP routine +* + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF + ELSE IF( C2.EQ.'GE' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN NB = 64 diff --git a/lapack-netlib/SRC/ilaenv2stage.f b/lapack-netlib/SRC/ilaenv2stage.f index 3c0d34a12..db30a1b4d 100644 --- a/lapack-netlib/SRC/ilaenv2stage.f +++ b/lapack-netlib/SRC/ilaenv2stage.f @@ -39,9 +39,9 @@ *> *> ILAENV2STAGE returns an INTEGER *> if ILAENV2STAGE >= 0: ILAENV2STAGE returns the value of the parameter -* specified by ISPEC +*> specified by ISPEC *> if ILAENV2STAGE < 0: if ILAENV2STAGE = -k, the k-th argument had an -* illegal value. +*> illegal value. 
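[Editor's sketch] The ILAENV hunk above hard-wires NB = 32 for the *LAORHR_GETRFNP routines. For reference, a sketch of how a caller queries ILAENV for a blocksize (ISPEC = 1; unused problem dimensions are passed as -1); the routine name and sizes here are illustrative.

      PROGRAM NBDEMO
*     Sketch only: ISPEC = 1 requests the optimal blocksize NB for
*     the named routine; unused dimensions are passed as -1.
      INTEGER   ILAENV, NB
      EXTERNAL  ILAENV
      NB = ILAENV( 1, 'SGETRF', ' ', 100, 100, -1, -1 )
      PRINT *, 'NB for SGETRF on a 100 x 100 matrix:', NB
      END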
*> *> This version provides a set of parameters which should give good, *> but not optimal, performance on many of the currently available diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F index 836e20eed..1a37300a7 100644 --- a/lapack-netlib/SRC/iparam2stage.F +++ b/lapack-netlib/SRC/iparam2stage.F @@ -35,7 +35,7 @@ *> \verbatim *> *> This program sets problem and machine dependent parameters -*> useful for xHETRD_2STAGE, xHETRD_H@2HB, xHETRD_HB2ST, +*> useful for xHETRD_2STAGE, xHETRD_HE2HB, xHETRD_HB2ST, *> xGEBRD_2STAGE, xGEBRD_GE2GB, xGEBRD_GB2BD *> and related subroutines for eigenvalue problems. *> It is called whenever ILAENV is called with 17 <= ISPEC <= 21. @@ -53,7 +53,7 @@ *> return. *> *> ISPEC=17: the optimal blocksize nb for the reduction to -* BAND +*> BAND *> *> ISPEC=18: the optimal blocksize ib for the eigenvectors *> singular vectors update routine @@ -90,14 +90,14 @@ *> \param[in] NBI *> \verbatim *> NBI is INTEGER which is the used in the reduciton, -* (e.g., the size of the band), needed to compute workspace -* and LHOUS2. +*> (e.g., the size of the band), needed to compute workspace +*> and LHOUS2. *> \endverbatim *> *> \param[in] IBI *> \verbatim *> IBI is INTEGER which represent the IB of the reduciton, -* needed to compute workspace and LHOUS2. +*> needed to compute workspace and LHOUS2. *> \endverbatim *> *> \param[in] NXI diff --git a/lapack-netlib/SRC/iparmq.f b/lapack-netlib/SRC/iparmq.f index a9212b3e0..bb711243d 100644 --- a/lapack-netlib/SRC/iparmq.f +++ b/lapack-netlib/SRC/iparmq.f @@ -60,7 +60,7 @@ *> invest in an (expensive) multi-shift QR sweep. *> If the aggressive early deflation subroutine *> finds LD converged eigenvalues from an order -*> NW deflation window and LD.GT.(NW*NIBBLE)/100, +*> NW deflation window and LD > (NW*NIBBLE)/100, *> then the next QR sweep is skipped and early *> deflation is applied immediately to the *> remaining active diagonal block. Setting @@ -184,8 +184,8 @@ *> This depends on ILO, IHI and NS, the *> number of simultaneous shifts returned *> by IPARMQ(ISPEC=15). The default for -*> (IHI-ILO+1).LE.500 is NS. The default -*> for (IHI-ILO+1).GT.500 is 3*NS/2. +*> (IHI-ILO+1) <= 500 is NS. The default +*> for (IHI-ILO+1) > 500 is 3*NS/2. *> *> IPARMQ(ISPEC=14) Nibble crossover point. Default: 14. 
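[Editor's sketch] A sketch of the nibble crossover test just documented, with hypothetical LD and NW; NIBBLE = 14 is the documented default returned for ISPEC = 14.

      PROGRAM NIBDEMO
*     Sketch only: if aggressive early deflation converged LD
*     eigenvalues in an NW-wide window and LD > (NW*NIBBLE)/100,
*     the next multishift QR sweep is skipped.  LD and NW are
*     hypothetical values.
      INTEGER   LD, NW, NIBBLE
      LD = 6
      NW = 30
      NIBBLE = 14
      IF( LD.GT.( NW*NIBBLE )/100 ) THEN
         PRINT *, 'skip the sweep, deflate again'
      ELSE
         PRINT *, 'proceed with the multishift QR sweep'
      END IF
      END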
*> diff --git a/lapack-netlib/SRC/meson.build b/lapack-netlib/SRC/meson.build new file mode 100644 index 000000000..bad682401 --- /dev/null +++ b/lapack-netlib/SRC/meson.build @@ -0,0 +1,11 @@ +ALLAUX = files('ilaenv.f', 'ilaenv2stage.f', 'ieeeck.f', 'lsamen.f', 'xerbla.f', 'xerbla_array.f', 'iparmq.f', 'iparam2stage.F', 'ilaprec.f', 'ilatrans.f', 'ilauplo.f', 'iladiag.f', 'chla_transtype.f', '../INSTALL/ilaver.f', '../INSTALL/lsame.f', '../INSTALL/slamch.f') + + +SCLAUX = files('sbdsdc.f', 'sbdsqr.f', 'sdisna.f', 'slabad.f', 'slacpy.f', 'sladiv.f', 'slae2.f', 'slaebz.f', 'slaed0.f', 'slaed1.f', 'slaed2.f', 'slaed3.f', 'slaed4.f', 'slaed5.f', 'slaed6.f', 'slaed7.f', 'slaed8.f', 'slaed9.f', 'slaeda.f', 'slaev2.f', 'slagtf.f', 'slagts.f', 'slamrg.f', 'slanst.f', 'slapy2.f', 'slapy3.f', 'slarnv.f', 'slarra.f', 'slarrb.f', 'slarrc.f', 'slarrd.f', 'slarre.f', 'slarrf.f', 'slarrj.f', 'slarrk.f', 'slarrr.f', 'slaneg.f', 'slartg.f', 'slaruv.f', 'slas2.f', 'slascl.f', 'slasd0.f', 'slasd1.f', 'slasd2.f', 'slasd3.f', 'slasd4.f', 'slasd5.f', 'slasd6.f', 'slasd7.f', 'slasd8.f', 'slasda.f', 'slasdq.f', 'slasdt.f', 'slaset.f', 'slasq1.f', 'slasq2.f', 'slasq3.f', 'slasq4.f', 'slasq5.f', 'slasq6.f', 'slasr.f', 'slasrt.f', 'slassq.f', 'slasv2.f', 'spttrf.f', 'sstebz.f', 'sstedc.f', 'ssteqr.f', 'ssterf.f', 'slaisnan.f', 'sisnan.f', 'slartgp.f', 'slartgs.f', '../INSTALL/second_INT_CPU_TIME.f') + +DZLAUX = files('dbdsdc.f', 'dbdsqr.f', 'ddisna.f', 'dlabad.f', 'dlacpy.f', 'dladiv.f', 'dlae2.f', 'dlaebz.f', 'dlaed0.f', 'dlaed1.f', 'dlaed2.f', 'dlaed3.f', 'dlaed4.f', 'dlaed5.f', 'dlaed6.f', 'dlaed7.f', 'dlaed8.f', 'dlaed9.f', 'dlaeda.f', 'dlaev2.f', 'dlagtf.f', 'dlagts.f', 'dlamrg.f', 'dlanst.f', 'dlapy2.f', 'dlapy3.f', 'dlarnv.f', 'dlarra.f', 'dlarrb.f', 'dlarrc.f', 'dlarrd.f', 'dlarre.f', 'dlarrf.f', 'dlarrj.f', 'dlarrk.f', 'dlarrr.f', 'dlaneg.f', 'dlartg.f', 'dlaruv.f', 'dlas2.f', 'dlascl.f', 'dlasd0.f', 'dlasd1.f', 'dlasd2.f', 'dlasd3.f', 'dlasd4.f', 'dlasd5.f', 'dlasd6.f', 'dlasd7.f', 'dlasd8.f', 'dlasda.f', 'dlasdq.f', 'dlasdt.f', 'dlaset.f', 'dlasq1.f', 'dlasq2.f', 'dlasq3.f', 'dlasq4.f', 'dlasq5.f', 'dlasq6.f', 'dlasr.f', 'dlasrt.f', 'dlassq.f', 'dlasv2.f', 'dpttrf.f', 'dstebz.f', 'dstedc.f', 'dsteqr.f', 'dsterf.f', 'dlaisnan.f', 'disnan.f', 'dlartgp.f', 'dlartgs.f', '../INSTALL/dlamch.f', '../INSTALL/dsecnd_INT_CPU_TIME.f') + + +SLASRC = files('sbdsvdx.f', 'spotrf2.f', 'sgetrf2.f', 'sgbbrd.f', 'sgbcon.f', 'sgbequ.f', 'sgbrfs.f', 'sgbsv.f', 'sgbsvx.f', 'sgbtf2.f', 'sgbtrf.f', 'sgbtrs.f', 'sgebak.f', 'sgebal.f', 'sgebd2.f', 'sgebrd.f', 'sgecon.f', 'sgeequ.f', 'sgees.f', 'sgeesx.f', 'sgeev.f', 'sgeevx.f', 'sgehd2.f', 'sgehrd.f', 'sgelq2.f', 'sgelqf.f', 'sgels.f', 'sgelsd.f', 'sgelss.f', 'sgelsy.f', 'sgeql2.f', 'sgeqlf.f', 'sgeqp3.f', 'sgeqr2.f', 'sgeqr2p.f', 'sgeqrf.f', 'sgeqrfp.f', 'sgerfs.f', 'sgerq2.f', 'sgerqf.f', 'sgesc2.f', 'sgesdd.f', 'sgesv.f', 'sgesvd.f', 'sgesvdx.f', 'sgesvx.f', 'sgetc2.f', 'sgetf2.f', 'sgetri.f', 'sggbak.f', 'sggbal.f', 'sgges.f', 'sgges3.f', 'sggesx.f', 'sggev.f', 'sggev3.f', 'sggevx.f', 'sggglm.f', 'sgghrd.f', 'sgghd3.f', 'sgglse.f', 'sggqrf.f', 'sggrqf.f', 'sggsvd3.f', 'sggsvp3.f', 'sgtcon.f', 'sgtrfs.f', 'sgtsv.f', 'sgtsvx.f', 'sgttrf.f', 'sgttrs.f', 'sgtts2.f', 'shgeqz.f', 'shsein.f', 'shseqr.f', 'slabrd.f', 'slacon.f', 'slacn2.f', 'slaein.f', 'slaexc.f', 'slag2.f', 'slags2.f', 'slagtm.f', 'slagv2.f', 'slahqr.f', 'slahr2.f', 'slaic1.f', 'slaln2.f', 'slals0.f', 'slalsa.f', 'slalsd.f', 'slangb.f', 'slange.f', 'slangt.f', 'slanhs.f', 'slansb.f', 'slansp.f', 'slansy.f', 
'slantb.f', 'slantp.f', 'slantr.f', 'slanv2.f', 'slapll.f', 'slapmt.f', 'slaqgb.f', 'slaqge.f', 'slaqp2.f', 'slaqps.f', 'slaqsb.f', 'slaqsp.f', 'slaqsy.f', 'slaqr0.f', 'slaqr1.f', 'slaqr2.f', 'slaqr3.f', 'slaqr4.f', 'slaqr5.f', 'slaqtr.f', 'slar1v.f', 'slar2v.f', 'ilaslr.f', 'ilaslc.f', 'slarf.f', 'slarfb.f', 'slarfg.f', 'slarfgp.f', 'slarft.f', 'slarfx.f', 'slarfy.f', 'slargv.f', 'slarrv.f', 'slartv.f', 'slarz.f', 'slarzb.f', 'slarzt.f', 'slaswp.f', 'slasy2.f', 'slasyf.f', 'slasyf_rook.f', 'slasyf_rk.f', 'slatbs.f', 'slatdf.f', 'slatps.f', 'slatrd.f', 'slatrs.f', 'slatrz.f', 'slauu2.f', 'slauum.f', 'sopgtr.f', 'sopmtr.f', 'sorg2l.f', 'sorg2r.f', 'sorgbr.f', 'sorghr.f', 'sorgl2.f', 'sorglq.f', 'sorgql.f', 'sorgqr.f', 'sorgr2.f', 'sorgrq.f', 'sorgtr.f', 'sorm2l.f', 'sorm2r.f', 'sorm22.f', 'sormbr.f', 'sormhr.f', 'sorml2.f', 'sormlq.f', 'sormql.f', 'sormqr.f', 'sormr2.f', 'sormr3.f', 'sormrq.f', 'sormrz.f', 'sormtr.f', 'spbcon.f', 'spbequ.f', 'spbrfs.f', 'spbstf.f', 'spbsv.f', 'spbsvx.f', 'spbtf2.f', 'spbtrf.f', 'spbtrs.f', 'spocon.f', 'spoequ.f', 'sporfs.f', 'sposv.f', 'sposvx.f', 'spotf2.f', 'spotri.f', 'spstrf.f', 'spstf2.f', 'sppcon.f', 'sppequ.f', 'spprfs.f', 'sppsv.f', 'sppsvx.f', 'spptrf.f', 'spptri.f', 'spptrs.f', 'sptcon.f', 'spteqr.f', 'sptrfs.f', 'sptsv.f', 'sptsvx.f', 'spttrs.f', 'sptts2.f', 'srscl.f', 'ssbev.f', 'ssbevd.f', 'ssbevx.f', 'ssbgst.f', 'ssbgv.f', 'ssbgvd.f', 'ssbgvx.f', 'ssbtrd.f', 'sspcon.f', 'sspev.f', 'sspevd.f', 'sspevx.f', 'sspgst.f', 'sspgv.f', 'sspgvd.f', 'sspgvx.f', 'ssprfs.f', 'sspsv.f', 'sspsvx.f', 'ssptrd.f', 'ssptrf.f', 'ssptri.f', 'ssptrs.f', 'sstegr.f', 'sstein.f', 'sstev.f', 'sstevd.f', 'sstevr.f', 'sstevx.f', 'ssycon.f', 'ssyev.f', 'ssyevd.f', 'ssyevr.f', 'ssyevx.f', 'ssygs2.f', 'ssygst.f', 'ssygv.f', 'ssygvd.f', 'ssygvx.f', 'ssyrfs.f', 'ssysv.f', 'ssysvx.f', 'ssytd2.f', 'ssytf2.f', 'ssytrd.f', 'ssytrf.f', 'ssytri.f', 'ssytri2.f', 'ssytri2x.f', 'ssyswapr.f', 'ssytrs.f', 'ssytrs2.f', 'ssyconv.f', 'ssyconvf.f', 'ssyconvf_rook.f', 'ssytf2_rook.f', 'ssytrf_rook.f', 'ssytrs_rook.f', 'ssytri_rook.f', 'ssycon_rook.f', 'ssysv_rook.f', 'ssytf2_rk.f', 'ssytrf_rk.f', 'ssytrs_3.f', 'ssytri_3.f', 'ssytri_3x.f', 'ssycon_3.f', 'ssysv_rk.f', 'slasyf_aa.f', 'ssysv_aa.f', 'ssytrf_aa.f', 'ssytrs_aa.f', 'ssysv_aa_2stage.f', 'ssytrf_aa_2stage.f', 'ssytrs_aa_2stage.f', 'stbcon.f', 'stbrfs.f', 'stbtrs.f', 'stgevc.f', 'stgex2.f', 'stgexc.f', 'stgsen.f', 'stgsja.f', 'stgsna.f', 'stgsy2.f', 'stgsyl.f', 'stpcon.f', 'stprfs.f', 'stptri.f', 'stptrs.f', 'strcon.f', 'strevc.f', 'strevc3.f', 'strexc.f', 'strrfs.f', 'strsen.f', 'strsna.f', 'strsyl.f', 'strti2.f', 'strtri.f', 'strtrs.f', 'stzrzf.f', 'sstemr.f', 'slansf.f', 'spftrf.f', 'spftri.f', 'spftrs.f', 'ssfrk.f', 'stfsm.f', 'stftri.f', 'stfttp.f', 'stfttr.f', 'stpttf.f', 'stpttr.f', 'strttf.f', 'strttp.f', 'sgejsv.f', 'sgesvj.f', 'sgsvj0.f', 'sgsvj1.f', 'sgeequb.f', 'ssyequb.f', 'spoequb.f', 'sgbequb.f', 'sbbcsd.f', 'slapmr.f', 'sorbdb.f', 'sorbdb1.f', 'sorbdb2.f', 'sorbdb3.f', 'sorbdb4.f', 'sorbdb5.f', 'sorbdb6.f', 'sorcsd.f', 'sorcsd2by1.f', 'sgeqrt.f', 'sgeqrt2.f', 'sgeqrt3.f', 'sgemqrt.f', 'stpqrt.f', 'stpqrt2.f', 'stpmqrt.f', 'stprfb.f', 'sgelqt.f', 'sgelqt3.f', 'sgemlqt.f', 'sgetsls.f', 'sgeqr.f', 'slatsqr.f', 'slamtsqr.f', 'sgemqr.f', 'sgelq.f', 'slaswlq.f', 'slamswlq.f', 'sgemlq.f', 'stplqt.f', 'stplqt2.f', 'stpmlqt.f', 'ssytrd_2stage.f', 'ssytrd_sy2sb.f', 'ssytrd_sb2st.F', 'ssb2st_kernels.f', 'ssyevd_2stage.f', 'ssyev_2stage.f', 'ssyevx_2stage.f', 'ssyevr_2stage.f', 'ssbev_2stage.f', 'ssbevx_2stage.f', 
'ssbevd_2stage.f', 'ssygv_2stage.f', 'sgesvdq.f', 'scombssq.f') + +DSLASRC = files('spotrs.f', 'sgetrs.f', 'spotrf.f', 'sgetrf.f') diff --git a/lapack-netlib/SRC/sbdsvdx.f b/lapack-netlib/SRC/sbdsvdx.f index a4b1887b2..c46674c47 100644 --- a/lapack-netlib/SRC/sbdsvdx.f +++ b/lapack-netlib/SRC/sbdsvdx.f @@ -165,7 +165,7 @@ *> *> \param[out] Z *> \verbatim -*> Z is REAL array, dimension (2*N,K) ) +*> Z is REAL array, dimension (2*N,K) *> If JOBZ = 'V', then if INFO = 0 the first NS columns of Z *> contain the singular vectors of the matrix B corresponding to *> the selected singular values, with U in rows 1 to N and V diff --git a/lapack-netlib/SRC/scombssq.f b/lapack-netlib/SRC/scombssq.f new file mode 100644 index 000000000..76bc0e320 --- /dev/null +++ b/lapack-netlib/SRC/scombssq.f @@ -0,0 +1,92 @@ +*> \brief \b SCOMBSSQ adds two scaled sum of squares quantities +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE SCOMBSSQ( V1, V2 ) +* +* .. Array Arguments .. +* REAL V1( 2 ), V2( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SCOMBSSQ adds two scaled sum of squares quantities, V1 := V1 + V2. +*> That is, +*> +*> V1_scale**2 * V1_sumsq := V1_scale**2 * V1_sumsq +*> + V2_scale**2 * V2_sumsq +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in,out] V1 +*> \verbatim +*> V1 is REAL array, dimension (2). +*> The first scaled sum. +*> V1(1) = V1_scale, V1(2) = V1_sumsq. +*> \endverbatim +*> +*> \param[in] V2 +*> \verbatim +*> V2 is REAL array, dimension (2). +*> The second scaled sum. +*> V2(1) = V2_scale, V2(2) = V2_sumsq. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2018 +* +*> \ingroup OTHERauxiliary +* +* ===================================================================== + SUBROUTINE SCOMBSSQ( V1, V2 ) +* +* -- LAPACK auxiliary routine (version 3.7.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2018 +* +* .. Array Arguments .. + REAL V1( 2 ), V2( 2 ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* + IF( V1( 1 ).GE.V2( 1 ) ) THEN + IF( V1( 1 ).NE.ZERO ) THEN + V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 ) + END IF + ELSE + V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 ) + V1( 1 ) = V2( 1 ) + END IF + RETURN +* +* End of SCOMBSSQ +* + END diff --git a/lapack-netlib/SRC/sgbrfsx.f b/lapack-netlib/SRC/sgbrfsx.f index 032b78b80..78ae584e1 100644 --- a/lapack-netlib/SRC/sgbrfsx.f +++ b/lapack-netlib/SRC/sgbrfsx.f @@ -308,7 +308,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. 
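[Editor's sketch] A usage sketch for the new SCOMBSSQ added above: two SLASSQ partial results in the (scale, sumsq) representation are merged so that scale**2 * sumsq is preserved without ever squaring a large scale. Program name and data are hypothetical; the printed norm is SQRT(1+4+4+16+4) = SQRT(29).

      PROGRAM SSQDEMO
*     Sketch only: accumulate sums of squares of X and Y separately
*     with SLASSQ, then merge the two (scale, sumsq) pairs with
*     SCOMBSSQ.  The norm of the concatenated vector is
*     V1(1)*SQRT(V1(2)) afterwards.
      REAL      X( 3 ), Y( 2 ), V1( 2 ), V2( 2 )
      EXTERNAL  SLASSQ, SCOMBSSQ
      DATA X / 1.0E0, 2.0E0, 2.0E0 /
      DATA Y / 4.0E0, 2.0E0 /
      V1( 1 ) = 0.0E0
      V1( 2 ) = 1.0E0
      V2( 1 ) = 0.0E0
      V2( 2 ) = 1.0E0
      CALL SLASSQ( 3, X, 1, V1( 1 ), V1( 2 ) )
      CALL SLASSQ( 2, Y, 1, V2( 1 ), V2( 2 ) )
      CALL SCOMBSSQ( V1, V2 )
      PRINT *, 'norm =', V1( 1 )*SQRT( V1( 2 ) )
      END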
*> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -344,14 +344,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -359,9 +359,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/sgbsvxx.f b/lapack-netlib/SRC/sgbsvxx.f index b2132325e..3c3d737b3 100644 --- a/lapack-netlib/SRC/sgbsvxx.f +++ b/lapack-netlib/SRC/sgbsvxx.f @@ -431,7 +431,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -467,14 +467,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -482,9 +482,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. 
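[Editor's sketch] A sketch of the PARAMS defaulting convention the expert drivers document above: entries below zero are replaced with defaults, entries set by the caller are honoured. Except for PARAMS(1), whose documented default is 1.0, the default values used here are illustrative only.

      PROGRAM PRMDEMO
*     Sketch only: mimic the defaulting a driver performs on PARAMS.
*     DEFLT(2:3) are illustrative, not the documented defaults.
      REAL      PARAMS( 3 ), DEFLT( 3 )
      INTEGER   I
      DATA PARAMS / -1.0E0, -1.0E0, 0.0E0 /
      DATA DEFLT  /  1.0E0, 10.0E0, 1.0E0 /
      DO 10 I = 1, 3
         IF( PARAMS( I ).LT.0.0E0 ) PARAMS( I ) = DEFLT( I )
   10 CONTINUE
*     PARAMS(3) = 0.0 was set by the caller, so componentwise
*     error bounds stay disabled.
      PRINT *, PARAMS
      END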
diff --git a/lapack-netlib/SRC/sgebak.f b/lapack-netlib/SRC/sgebak.f index ec58bf335..5c64c8b97 100644 --- a/lapack-netlib/SRC/sgebak.f +++ b/lapack-netlib/SRC/sgebak.f @@ -47,10 +47,10 @@ *> \verbatim *> JOB is CHARACTER*1 *> Specifies the type of backward transformation required: -*> = 'N', do nothing, return immediately; -*> = 'P', do backward transformation for permutation only; -*> = 'S', do backward transformation for scaling only; -*> = 'B', do backward transformations for both permutation and +*> = 'N': do nothing, return immediately; +*> = 'P': do backward transformation for permutation only; +*> = 'S': do backward transformation for scaling only; +*> = 'B': do backward transformations for both permutation and *> scaling. *> JOB must be the same as the argument JOB supplied to SGEBAL. *> \endverbatim diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f index c90de9b81..5ffa3bc37 100644 --- a/lapack-netlib/SRC/sgeesx.f +++ b/lapack-netlib/SRC/sgeesx.f @@ -583,7 +583,9 @@ IF( N.GT.I+1 ) $ CALL SSWAP( N-I-1, A( I, I+2 ), LDA, $ A( I+1, I+2 ), LDA ) - CALL SSWAP( N, VS( 1, I ), 1, VS( 1, I+1 ), 1 ) + IF( WANTVS ) THEN + CALL SSWAP( N, VS( 1, I ), 1, VS( 1, I+1 ), 1 ) + END IF A( I, I+1 ) = A( I+1, I ) A( I+1, I ) = ZERO END IF diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f index e4cbe8d0e..4ad316d99 100644 --- a/lapack-netlib/SRC/sgejsv.f +++ b/lapack-netlib/SRC/sgejsv.f @@ -82,7 +82,7 @@ *> desirable, then this option is advisable. The input matrix A *> is preprocessed with QR factorization with FULL (row and *> column) pivoting. -*> = 'G' Computation as with 'F' with an additional estimate of the +*> = 'G': Computation as with 'F' with an additional estimate of the *> condition number of B, where A=D*B. If A has heavily weighted *> rows, then using this condition number gives too pessimistic *> error bound. @@ -133,7 +133,7 @@ *> specified range. If A .NE. 0 is scaled so that the largest singular *> value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues *> the licence to kill columns of A whose norm in c*A is less than -*> SQRT(SFMIN) (for JOBR.EQ.'R'), or less than SMALL=SFMIN/EPSLN, +*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, *> where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E'). *> = 'N': Do not kill small columns of c*A. This option assumes that *> BLAS and QR factorizations and triangular solvers are @@ -230,7 +230,7 @@ *> If JOBU = 'F', then U contains on exit the M-by-M matrix of *> the left singular vectors, including an ONB *> of the orthogonal complement of the Range(A). -*> If JOBU = 'W' .AND. (JOBV.EQ.'V' .AND. JOBT.EQ.'T' .AND. M.EQ.N), +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), *> then U is used as workspace if the procedure *> replaces A with A^t. In that case, [V] is computed *> in U as left singular vectors of A^t and then @@ -252,7 +252,7 @@ *> V is REAL array, dimension ( LDV, N ) *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; -*> If JOBV = 'W', AND (JOBU.EQ.'U' AND JOBT.EQ.'T' AND M.EQ.N), +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), *> then V is used as workspace if the pprocedure *> replaces A with A^t. In that case, [U] is computed *> in V as right singular vectors of A^t and then @@ -278,7 +278,7 @@ *> of A. (See the description of SVA().) *> WORK(2) = See the description of WORK(1). *> WORK(3) = SCONDA is an estimate for the condition number of -*> column equilibrated A. (If JOBA .EQ. 
'E' or 'G') +*> column equilibrated A. (If JOBA = 'E' or 'G') *> SCONDA is an estimate of SQRT(||(R^t * R)^(-1)||_1). *> It is computed using SPOCON. It holds *> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA @@ -297,7 +297,7 @@ *> triangular factor in the first QR factorization. *> WORK(5) = an estimate of the scaled condition number of the *> triangular factor in the second QR factorization. -*> The following two parameters are computed if JOBT .EQ. 'T'. +*> The following two parameters are computed if JOBT = 'T'. *> They are provided for a developer/implementer who is familiar *> with the details of the method. *> @@ -313,8 +313,8 @@ *> Length of WORK to confirm proper allocation of work space. *> LWORK depends on the job: *> -*> If only SIGMA is needed ( JOBU.EQ.'N', JOBV.EQ.'N' ) and -*> -> .. no scaled condition estimate required (JOBE.EQ.'N'): +*> If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and +*> -> .. no scaled condition estimate required (JOBE = 'N'): *> LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement. *> ->> For optimal performance (blocked code) the optimal value *> is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal @@ -330,7 +330,7 @@ *> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), *> N+N*N+LWORK(DPOCON),7). *> -*> If SIGMA and the right singular vectors are needed (JOBV.EQ.'V'), +*> If SIGMA and the right singular vectors are needed (JOBV = 'V'), *> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). *> -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7), *> where NB is the optimal block size for DGEQP3, DGEQRF, DGELQ, @@ -341,19 +341,19 @@ *> If SIGMA and the left singular vectors are needed *> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). *> -> For optimal performance: -*> if JOBU.EQ.'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7), -*> if JOBU.EQ.'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7), +*> if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7), +*> if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7), *> where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR. *> In general, the optimal length LWORK is computed as *> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON), *> 2*N+LWORK(DGEQRF), N+LWORK(DORMQR)). -*> Here LWORK(DORMQR) equals N*NB (for JOBU.EQ.'U') or -*> M*NB (for JOBU.EQ.'F'). +*> Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or +*> M*NB (for JOBU = 'F'). *> -*> If the full SVD is needed: (JOBU.EQ.'U' or JOBU.EQ.'F') and -*> -> if JOBV.EQ.'V' +*> If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> -> if JOBV = 'V' *> the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N). -*> -> if JOBV.EQ.'J' the minimal requirement is +*> -> if JOBV = 'J' the minimal requirement is *> LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6). *> -> For optimal performance, LWORK should be additionally *> larger than N+M*NB, where NB is the optimal block size @@ -369,7 +369,7 @@ *> of JOBA and JOBR. *> IWORK(2) = the number of the computed nonzero singular values *> IWORK(3) = if nonzero, a warning message: -*> If IWORK(3).EQ.1 then some of the column norms of A +*> If IWORK(3) = 1 then some of the column norms of A *> were denormalized floats. The requested high accuracy *> is not warranted by the data. *> \endverbatim @@ -377,10 +377,10 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> < 0 : if INFO = -i, then the i-th argument had an illegal value. -*> = 0 : successful exit; -*> > 0 : SGEJSV did not converge in the maximal allowed number -*> of sweeps. 
The computed values may be inaccurate. +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: SGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. *> \endverbatim * * Authors: @@ -953,7 +953,7 @@ IF ( L2ABER ) THEN * Standard absolute error bound suffices. All sigma_i with * sigma_i < N*EPSLN*||A|| are flushed to zero. This is an -* agressive enforcement of lower numerical rank by introducing a +* aggressive enforcement of lower numerical rank by introducing a * backward error of the order of N*EPSLN*||A||. TEMP1 = SQRT(FLOAT(N))*EPSLN DO 3001 p = 2, N @@ -965,7 +965,7 @@ 3001 CONTINUE 3002 CONTINUE ELSE IF ( L2RANK ) THEN -* .. similarly as above, only slightly more gentle (less agressive). +* .. similarly as above, only slightly more gentle (less aggressive). * Sudden drop on the diagonal of R1 is used as the criterion for * close-to-rank-deficient. TEMP1 = SQRT(SFMIN) @@ -1294,7 +1294,7 @@ CALL SPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1, $ WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR) CONDR1 = ONE / SQRT(TEMP1) -* .. here need a second oppinion on the condition number +* .. here need a second opinion on the condition number * .. then assume worst case scenario * R1 is OK for inverse <=> CONDR1 .LT. FLOAT(N) * more conservative <=> CONDR1 .LT. SQRT(FLOAT(N)) @@ -1335,7 +1335,7 @@ ELSE * * .. ill-conditioned case: second QRF with pivoting -* Note that windowed pivoting would be equaly good +* Note that windowed pivoting would be equally good * numerically, and more run-time efficient. So, in * an optimal implementation, the next call to SGEQP3 * should be replaced with eg. CALL SGEQPX (ACM TOMS #782) @@ -1388,7 +1388,7 @@ * IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 -* (this overwrittes the copy of R2, as it will not be +* (this overwrites the copy of R2, as it will not be * needed in this branch, but it does not overwritte the * Huseholder vectors of Q2.). CALL SLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) @@ -1638,7 +1638,7 @@ * * This branch deploys a preconditioned Jacobi SVD with explicitly * accumulated rotations. It is included as optional, mainly for -* experimental purposes. It does perfom well, and can also be used. +* experimental purposes. It does perform well, and can also be used. * In this implementation, this branch will be automatically activated * if the condition number sigma_max(A) / sigma_min(A) is predicted * to be greater than the overflow threshold. This is because the diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 4fe4d191d..96c4097e8 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -1,3 +1,4 @@ +*> \brief \b SGELQ * * Definition: * =========== @@ -17,7 +18,17 @@ * ============= *> *> \verbatim -*> SGELQ computes a LQ factorization of an M-by-N matrix A. +*> +*> SGELQ computes an LQ factorization of a real M-by-N matrix A: +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a N-by-N orthogonal matrix; +*> L is an lower-triangular M-by-M matrix; +*> 0 is a M-by-(N-M) zero matrix, if M < N. +*> *> \endverbatim * * Arguments: @@ -138,7 +149,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. 
These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> @@ -159,10 +170,10 @@ SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, TSIZE, LWORK diff --git a/lapack-netlib/SRC/sgelq2.f b/lapack-netlib/SRC/sgelq2.f index 5b1ad215a..df8128d53 100644 --- a/lapack-netlib/SRC/sgelq2.f +++ b/lapack-netlib/SRC/sgelq2.f @@ -33,8 +33,16 @@ *> *> \verbatim *> -*> SGELQ2 computes an LQ factorization of a real m by n matrix A: -*> A = L * Q. +*> SGELQ2 computes an LQ factorization of a real m-by-n matrix A: +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a n-by-n orthogonal matrix; +*> L is an lower-triangular m-by-m matrix; +*> 0 is a m-by-(n-m) zero matrix, if m < n. +*> *> \endverbatim * * Arguments: @@ -96,7 +104,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup realGEcomputational * @@ -121,10 +129,10 @@ * ===================================================================== SUBROUTINE SGELQ2( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f index 99c03c0a3..90357e623 100644 --- a/lapack-netlib/SRC/sgelqf.f +++ b/lapack-netlib/SRC/sgelqf.f @@ -34,7 +34,15 @@ *> \verbatim *> *> SGELQF computes an LQ factorization of a real M-by-N matrix A: -*> A = L * Q. +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a N-by-N orthogonal matrix; +*> L is an lower-triangular M-by-M matrix; +*> 0 is a M-by-(N-M) zero matrix, if M < N. +*> *> \endverbatim * * Arguments: @@ -110,7 +118,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup realGEcomputational * @@ -135,10 +143,10 @@ * ===================================================================== SUBROUTINE SGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. 
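[Editor's sketch] A driver-level sketch of the LQ factorization A = ( L 0 ) * Q documented above, using the standard LWORK = -1 workspace query before the real SGELQF call; program name, sizes and data are hypothetical.

      PROGRAM LQDEMO
*     Sketch only: query the optimal workspace, then factorize.
*     L fills the lower trapezoid of A, the Householder vectors
*     defining Q the strict upper trapezoid, TAU the scalars.
      INTEGER   M, N, LDA, LWMAX
      PARAMETER ( M = 2, N = 4, LDA = M, LWMAX = 1024 )
      INTEGER   INFO, LWORK
      REAL      A( LDA, N ), TAU( M ), WORK( LWMAX )
      EXTERNAL  SGELQF
      DATA A / 1.0E0, 0.0E0, 2.0E0, 1.0E0,
     $         0.0E0, 3.0E0, 1.0E0, 1.0E0 /
*     Workspace query: optimal LWORK is returned in WORK(1).
      CALL SGELQF( M, N, A, LDA, TAU, WORK, -1, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
      CALL SGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
      PRINT *, 'INFO =', INFO
      END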
INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/sgelqt.f b/lapack-netlib/SRC/sgelqt.f index 9a93af332..64d46025c 100644 --- a/lapack-netlib/SRC/sgelqt.f +++ b/lapack-netlib/SRC/sgelqt.f @@ -1,3 +1,5 @@ +*> \brief \b SGELQT +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/sgelqt3.f b/lapack-netlib/SRC/sgelqt3.f index 292ae88a3..edf5d6d30 100644 --- a/lapack-netlib/SRC/sgelqt3.f +++ b/lapack-netlib/SRC/sgelqt3.f @@ -1,3 +1,5 @@ +*> \brief \b SGELQT3 +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f index dedbe7752..5f2e02a8e 100644 --- a/lapack-netlib/SRC/sgemlq.f +++ b/lapack-netlib/SRC/sgemlq.f @@ -1,3 +1,4 @@ +*> \brief \b SGEMLQ * * Definition: * =========== @@ -143,7 +144,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> diff --git a/lapack-netlib/SRC/sgemlqt.f b/lapack-netlib/SRC/sgemlqt.f index a8f022bdc..37850fdf5 100644 --- a/lapack-netlib/SRC/sgemlqt.f +++ b/lapack-netlib/SRC/sgemlqt.f @@ -1,3 +1,5 @@ +*> \brief \b SGEMLQT +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f index 307fc8ca9..66c5117c9 100644 --- a/lapack-netlib/SRC/sgemqr.f +++ b/lapack-netlib/SRC/sgemqr.f @@ -1,3 +1,4 @@ +*> \brief \b SGEMQR * * Definition: * =========== @@ -144,7 +145,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f index f939abd9d..4a6bb9ea5 100644 --- a/lapack-netlib/SRC/sgeqr.f +++ b/lapack-netlib/SRC/sgeqr.f @@ -1,3 +1,4 @@ +*> \brief \b SGEQR * * Definition: * =========== @@ -17,7 +18,18 @@ * ============= *> *> \verbatim -*> SGEQR computes a QR factorization of an M-by-N matrix A. +*> +*> SGEQR computes a QR factorization of a real M-by-N matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -138,7 +150,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> @@ -160,10 +172,10 @@ SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. 
-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, TSIZE, LWORK diff --git a/lapack-netlib/SRC/sgeqr2.f b/lapack-netlib/SRC/sgeqr2.f index 3b990f825..0a1ff304f 100644 --- a/lapack-netlib/SRC/sgeqr2.f +++ b/lapack-netlib/SRC/sgeqr2.f @@ -33,8 +33,17 @@ *> *> \verbatim *> -*> SGEQR2 computes a QR factorization of a real m by n matrix A: -*> A = Q * R. +*> SGEQR2 computes a QR factorization of a real m-by-n matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a m-by-m orthogonal matrix; +*> R is an upper-triangular n-by-n matrix; +*> 0 is a (m-n)-by-n zero matrix, if m > n. +*> *> \endverbatim * * Arguments: @@ -96,7 +105,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup realGEcomputational * @@ -121,10 +130,10 @@ * ===================================================================== SUBROUTINE SGEQR2( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/sgeqr2p.f b/lapack-netlib/SRC/sgeqr2p.f index f48af9d2d..08d124797 100644 --- a/lapack-netlib/SRC/sgeqr2p.f +++ b/lapack-netlib/SRC/sgeqr2p.f @@ -33,8 +33,18 @@ *> *> \verbatim *> -*> SGEQR2P computes a QR factorization of a real m by n matrix A: -*> A = Q * R. The diagonal entries of R are nonnegative. +*> SGEQR2P computes a QR factorization of a real m-by-n matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a m-by-m orthogonal matrix; +*> R is an upper-triangular n-by-n matrix with nonnegative diagonal +*> entries; +*> 0 is a (m-n)-by-n zero matrix, if m > n. +*> *> \endverbatim * * Arguments: @@ -97,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup realGEcomputational * @@ -124,10 +134,10 @@ * ===================================================================== SUBROUTINE SGEQR2P( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f index 0f79c2ca5..7df495e04 100644 --- a/lapack-netlib/SRC/sgeqrf.f +++ b/lapack-netlib/SRC/sgeqrf.f @@ -34,7 +34,16 @@ *> \verbatim *> *> SGEQRF computes a QR factorization of a real M-by-N matrix A: -*> A = Q * R. +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -111,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
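[Editor's sketch] A matching sketch for the QR side, A = Q * ( R ; 0 ), as documented above: SGEQRF with a workspace query, then SORGQR to assemble the first N columns of Q explicitly. Sizes and data are hypothetical; in real use copy R out of the upper triangle first, since SORGQR overwrites A with Q.

      PROGRAM QRDEMO
*     Sketch only: factorize, then expand the Householder vectors
*     into an explicit Q.  After SGEQRF the upper triangle of A
*     holds R; SORGQR overwrites A(1:M,1:N) with Q's columns.
      INTEGER   M, N, LDA, LWMAX
      PARAMETER ( M = 4, N = 2, LDA = M, LWMAX = 1024 )
      INTEGER   INFO, LWORK
      REAL      A( LDA, N ), TAU( N ), WORK( LWMAX )
      EXTERNAL  SGEQRF, SORGQR
      DATA A / 1.0E0, 1.0E0, 1.0E0, 1.0E0,
     $         1.0E0, 2.0E0, 3.0E0, 4.0E0 /
      CALL SGEQRF( M, N, A, LDA, TAU, WORK, -1, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
      CALL SGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
      CALL SORGQR( M, N, N, A, LDA, TAU, WORK, LWORK, INFO )
      PRINT *, 'INFO =', INFO
      END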
* -*> \date December 2016 +*> \date November 2019 * *> \ingroup realGEcomputational * @@ -136,10 +145,10 @@ * ===================================================================== SUBROUTINE SGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f index 654c0a13a..7f6741570 100644 --- a/lapack-netlib/SRC/sgeqrfp.f +++ b/lapack-netlib/SRC/sgeqrfp.f @@ -33,8 +33,18 @@ *> *> \verbatim *> -*> SGEQRFP computes a QR factorization of a real M-by-N matrix A: -*> A = Q * R. The diagonal entries of R are nonnegative. +*> SGEQR2P computes a QR factorization of a real M-by-N matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix with nonnegative diagonal +*> entries; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -112,7 +122,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup realGEcomputational * @@ -139,10 +149,10 @@ * ===================================================================== SUBROUTINE SGEQRFP( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/sgerfsx.f b/lapack-netlib/SRC/sgerfsx.f index 3f518899e..b1a1eb13d 100644 --- a/lapack-netlib/SRC/sgerfsx.f +++ b/lapack-netlib/SRC/sgerfsx.f @@ -283,7 +283,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -319,14 +319,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -334,9 +334,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. 
-*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/sgesc2.f b/lapack-netlib/SRC/sgesc2.f index c78daa334..3a6f34584 100644 --- a/lapack-netlib/SRC/sgesc2.f +++ b/lapack-netlib/SRC/sgesc2.f @@ -90,7 +90,7 @@ *> \verbatim *> SCALE is REAL *> On exit, SCALE contains the scale factor. SCALE is chosen -*> 0 <= SCALE <= 1 to prevent owerflow in the solution. +*> 0 <= SCALE <= 1 to prevent overflow in the solution. *> \endverbatim * * Authors: @@ -151,7 +151,7 @@ * .. * .. Executable Statements .. * -* Set constant to control owerflow +* Set constant to control overflow * EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) / EPS diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f index 0ba2a78c7..689494dd1 100644 --- a/lapack-netlib/SRC/sgesdd.f +++ b/lapack-netlib/SRC/sgesdd.f @@ -322,7 +322,7 @@ * IF( WNTQN ) THEN * sbdsdc needs only 4*N (or 6*N for uplo=L for LAPACK <= 3.6) -* keep 7*N for backwards compatability. +* keep 7*N for backwards compatibility. BDSPAC = 7*N ELSE BDSPAC = 3*N*N + 4*N @@ -448,7 +448,7 @@ * IF( WNTQN ) THEN * sbdsdc needs only 4*N (or 6*N for uplo=L for LAPACK <= 3.6) -* keep 7*N for backwards compatability. +* keep 7*N for backwards compatibility. BDSPAC = 7*M ELSE BDSPAC = 3*M*M + 4*M diff --git a/lapack-netlib/SRC/sgesvdq.f b/lapack-netlib/SRC/sgesvdq.f new file mode 100644 index 000000000..73e34862f --- /dev/null +++ b/lapack-netlib/SRC/sgesvdq.f @@ -0,0 +1,1388 @@ +*> \brief SGESVDQ computes the singular value decomposition (SVD) with a QR-Preconditioned QR SVD Method for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGESVDQ + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA, +* S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK, +* WORK, LWORK, RWORK, LRWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* CHARACTER JOBA, JOBP, JOBR, JOBU, JOBV +* INTEGER M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LWORK, LRWORK, +* INFO +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), U( LDU, * ), V( LDV, * ), WORK( * ) +* REAL S( * ), RWORK( * ) +* INTEGER IWORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGESVDQ computes the singular value decomposition (SVD) of a real +*> M-by-N matrix A, where M >= N. The SVD of A is written as +*> [++] [xx] [x0] [xx] +*> A = U * SIGMA * V^*, [++] = [xx] * [ox] * [xx] +*> [++] [xx] +*> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal +*> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements +*> of SIGMA are the singular values of A. The columns of U and V are the +*> left and the right singular vectors of A, respectively. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy in the computed SVD +*> = 'A' The requested accuracy corresponds to having the backward +*> error bounded by || delta A ||_F <= f(m,n) * EPS * || A ||_F, +*> where EPS = SLAMCH('Epsilon'). 
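[Editor's sketch] A sketch of the SCALE convention whose wording the sgesc2.f hunk fixes: SGESC2 solves A*x = SCALE*b from the complete-pivoting LU produced by SGETC2, with 0 <= SCALE <= 1 chosen to prevent overflow in the solution. Data are hypothetical; here SCALE = 1 and x = (1,1)**T.

      PROGRAM SC2DEMO
*     Sketch only: complete-pivoting LU via SGETC2, then a scaled
*     solve via SGESC2.  The true solution is RHS/SCALE.
      INTEGER   N, LDA
      PARAMETER ( N = 2, LDA = N )
      INTEGER   IPIV( N ), JPIV( N ), INFO
      REAL      A( LDA, N ), RHS( N ), SCALE
      EXTERNAL  SGETC2, SGESC2
      DATA A   / 2.0E0, 1.0E0, 1.0E0, 3.0E0 /
      DATA RHS / 3.0E0, 4.0E0 /
      CALL SGETC2( N, A, LDA, IPIV, JPIV, INFO )
      CALL SGESC2( N, A, LDA, RHS, IPIV, JPIV, SCALE )
      PRINT *, 'SCALE =', SCALE, '  X =', RHS( 1 )/SCALE,
     $         RHS( 2 )/SCALE
      END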
This authorises CGESVDQ to +*> truncate the computed triangular factor in a rank revealing +*> QR factorization whenever the truncated part is below the +*> threshold of the order of EPS * ||A||_F. This is aggressive +*> truncation level. +*> = 'M' Similarly as with 'A', but the truncation is more gentle: it +*> is allowed only when there is a drop on the diagonal of the +*> triangular factor in the QR factorization. This is medium +*> truncation level. +*> = 'H' High accuracy requested. No numerical rank determination based +*> on the rank revealing QR factorization is attempted. +*> = 'E' Same as 'H', and in addition the condition number of column +*> scaled A is estimated and returned in RWORK(1). +*> N^(-1/4)*RWORK(1) <= ||pinv(A_scaled)||_2 <= N^(1/4)*RWORK(1) +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> = 'P' The rows of A are ordered in decreasing order with respect to +*> ||A(i,:)||_\infty. This enhances numerical accuracy at the cost +*> of extra data movement. Recommended for numerical robustness. +*> = 'N' No row pivoting. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> = 'T' After the initial pivoted QR factorization, SGESVD is applied to +*> the transposed R**T of the computed triangular factor R. This involves +*> some extra data movement (matrix transpositions). Useful for +*> experiments, research and development. +*> = 'N' The triangular factor R is given as input to SGESVD. This may be +*> preferred as it involves less data movement. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> = 'A' All M left singular vectors are computed and returned in the +*> matrix U. See the description of U. +*> = 'S' or 'U' N = min(M,N) left singular vectors are computed and returned +*> in the matrix U. See the description of U. +*> = 'R' Numerical rank NUMRANK is determined and only NUMRANK left singular +*> vectors are computed and returned in the matrix U. +*> = 'F' The N left singular vectors are returned in factored form as the +*> product of the Q factor from the initial QR factorization and the +*> N left singular vectors of (R**T , 0)**T. If row pivoting is used, +*> then the necessary information on the row pivoting is stored in +*> IWORK(N+1:N+M-1). +*> = 'N' The left singular vectors are not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> = 'A', 'V' All N right singular vectors are computed and returned in +*> the matrix V. +*> = 'R' Numerical rank NUMRANK is determined and only NUMRANK right singular +*> vectors are computed and returned in the matrix V. This option is +*> allowed only if JOBU = 'R' or JOBU = 'N'; otherwise it is illegal. +*> = 'N' The right singular vectors are not computed. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array of dimensions LDA x N +*> On entry, the input matrix A. +*> On exit, if JOBU .NE. 'N' or JOBV .NE. 'N', the lower triangle of A contains +*> the Householder vectors as stored by SGEQP3. If JOBU = 'F', these Householder +*> vectors together with WORK(1:N) can be used to restore the Q factors from +*> the initial pivoted QR factorization of A. See the description of U. 
+*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER. +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array of dimension N. +*> The singular values of A, ordered so that S(i) >= S(i+1). +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is REAL array, dimension +*> LDU x M if JOBU = 'A'; see the description of LDU. In this case, +*> on exit, U contains the M left singular vectors. +*> LDU x N if JOBU = 'S', 'U', 'R' ; see the description of LDU. In this +*> case, U contains the leading N or the leading NUMRANK left singular vectors. +*> LDU x N if JOBU = 'F' ; see the description of LDU. In this case U +*> contains N x N orthogonal matrix that can be used to form the left +*> singular vectors. +*> If JOBU = 'N', U is not referenced. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER. +*> The leading dimension of the array U. +*> If JOBU = 'A', 'S', 'U', 'R', LDU >= max(1,M). +*> If JOBU = 'F', LDU >= max(1,N). +*> Otherwise, LDU >= 1. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is REAL array, dimension +*> LDV x N if JOBV = 'A', 'V', 'R' or if JOBA = 'E' . +*> If JOBV = 'A', or 'V', V contains the N-by-N orthogonal matrix V**T; +*> If JOBV = 'R', V contains the first NUMRANK rows of V**T (the right +*> singular vectors, stored rowwise, of the NUMRANK largest singular values). +*> If JOBV = 'N' and JOBA = 'E', V is used as a workspace. +*> If JOBV = 'N', and JOBA.NE.'E', V is not referenced. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If JOBV = 'A', 'V', 'R', or JOBA = 'E', LDV >= max(1,N). +*> Otherwise, LDV >= 1. +*> \endverbatim +*> +*> \param[out] NUMRANK +*> \verbatim +*> NUMRANK is INTEGER +*> NUMRANK is the numerical rank first determined after the rank +*> revealing QR factorization, following the strategy specified by the +*> value of JOBA. If JOBV = 'R' and JOBU = 'R', only NUMRANK +*> leading singular values and vectors are then requested in the call +*> of SGESVD. The final value of NUMRANK might be further reduced if +*> some singular values are computed as zeros. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (max(1, LIWORK)). +*> On exit, IWORK(1:N) contains column pivoting permutation of the +*> rank revealing QR factorization. +*> If JOBP = 'P', IWORK(N+1:N+M-1) contains the indices of the sequence +*> of row swaps used in row pivoting. These can be used to restore the +*> left singular vectors in the case JOBU = 'F'. +*> +*> If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> LIWORK(1) returns the minimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> LIWORK >= N + M - 1, if JOBP = 'P' and JOBA .NE. 'E'; +*> LIWORK >= N if JOBP = 'N' and JOBA .NE. 'E'; +*> LIWORK >= N + M - 1 + N, if JOBP = 'P' and JOBA = 'E'; +*> LIWORK >= N + N if JOBP = 'N' and JOBA = 'E'. +* +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the WORK, IWORK, and RWORK arrays, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (max(2, LWORK)), used as a workspace. 
+*> On exit, if, on entry, LWORK.NE.-1, WORK(1:N) contains parameters +*> needed to recover the Q factor from the QR factorization computed by +*> SGEQP3. +*> +*> If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> WORK(1) returns the optimal LWORK, and +*> WORK(2) returns the minimal LWORK. +*> \endverbatim +*> +*> \param[in,out] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. It is determined as follows: +*> Let LWQP3 = 3*N+1, LWCON = 3*N, and let +*> LWORQ = { MAX( N, 1 ), if JOBU = 'R', 'S', or 'U' +*> { MAX( M, 1 ), if JOBU = 'A' +*> LWSVD = MAX( 5*N, 1 ) +*> LWLQF = MAX( N/2, 1 ), LWSVD2 = MAX( 5*(N/2), 1 ), LWORLQ = MAX( N, 1 ), +*> LWQRF = MAX( N/2, 1 ), LWORQ2 = MAX( N, 1 ) +*> Then the minimal value of LWORK is: +*> = MAX( N + LWQP3, LWSVD ) if only the singular values are needed; +*> = MAX( N + LWQP3, LWCON, LWSVD ) if only the singular values are needed, +*> and a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD, LWORQ ) if the singular values and the left +*> singular vectors are requested; +*> = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) if the singular values and the left +*> singular vectors are requested, and also +*> a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD ) if the singular values and the right +*> singular vectors are requested; +*> = N + MAX( LWQP3, LWCON, LWSVD ) if the singular values and the right +*> singular vectors are requested, and also +*> a scaled condition etimate requested; +*> +*> = N + MAX( LWQP3, LWSVD, LWORQ ) if the full SVD is requested with JOBV = 'R'; +*> independent of JOBR; +*> = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) if the full SVD is requested, +*> JOBV = 'R' and, also a scaled condition +*> estimate requested; independent of JOBR; +*> = MAX( N + MAX( LWQP3, LWSVD, LWORQ ), +*> N + MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, N/2+LWORLQ, LWORQ) ) if the +*> full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='N' +*> = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWORQ ), +*> N + MAX( LWQP3, LWCON, N/2+LWLQF, N/2+LWSVD2, N/2+LWORLQ, LWORQ ) ) +*> if the full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='N', and also a scaled condition number estimate +*> requested. +*> = MAX( N + MAX( LWQP3, LWSVD, LWORQ ), +*> N + MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, N/2+LWORQ2, LWORQ ) ) if the +*> full SVD is requested with JOBV = 'A', 'V', and JOBR ='T' +*> = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWORQ ), +*> N + MAX( LWQP3, LWCON, N/2+LWQRF, N/2+LWSVD2, N/2+LWORQ2, LWORQ ) ) +*> if the full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='T', and also a scaled condition number estimate +*> requested. +*> Finally, LWORK must be at least two: LWORK = MAX( 2, LWORK ). +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the WORK, IWORK, and RWORK arrays, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (max(1, LRWORK)). +*> On exit, +*> 1. If JOBA = 'E', RWORK(1) contains an estimate of the condition +*> number of column scaled A. If A = C * D where D is diagonal and C +*> has unit columns in the Euclidean norm, then, assuming full column rank, +*> N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1). +*> Otherwise, RWORK(1) = -1. +*> 2. 
*>
*> \param[out] RWORK
*> \verbatim
*> RWORK is REAL array, dimension (max(1, LRWORK)).
*> On exit,
*> 1. If JOBA = 'E', RWORK(1) contains an estimate of the condition
*> number of column scaled A. If A = C * D where D is diagonal and C
*> has unit columns in the Euclidean norm, then, assuming full column rank,
*> N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1).
*> Otherwise, RWORK(1) = -1.
*> 2. RWORK(2) contains the number of singular values computed as
*> exact zeros in SGESVD applied to the upper triangular or trapezoidal
*> R (from the initial QR factorization). In case of early exit (no call to
*> SGESVD, such as in the case of zero matrix) RWORK(2) = -1.
*>
*> If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0,
*> RWORK(1) returns the minimal LRWORK.
*> \endverbatim
*>
*> \param[in] LRWORK
*> \verbatim
*> LRWORK is INTEGER.
*> The dimension of the array RWORK.
*> If JOBP = 'P', then LRWORK >= MAX(2, M).
*> Otherwise, LRWORK >= 2.
*
*> If LRWORK = -1, then a workspace query is assumed; the routine
*> only calculates and returns the optimal and minimal sizes
*> for the WORK, IWORK, and RWORK arrays, and no error
*> message related to LWORK is issued by XERBLA.
*> \endverbatim
*>
*> \param[out] INFO
*> \verbatim
*> INFO is INTEGER
*> = 0: successful exit.
*> < 0: if INFO = -i, the i-th argument had an illegal value.
*> > 0: if SBDSQR did not converge, INFO specifies how many superdiagonals
*> of an intermediate bidiagonal form B (computed in SGESVD) did not
*> converge to zero.
*> \endverbatim
*
*> \par Further Details:
* ========================
*>
*> \verbatim
*>
*> 1. The data movement (matrix transpose) is coded using simple nested
*> DO-loops because BLAS and LAPACK do not provide corresponding subroutines.
*> Those DO-loops are easily identified in this source code - by the CONTINUE
*> statements labeled with 11**. In an optimized version of this code, the
*> nested DO loops should be replaced with calls to an optimized subroutine.
*> 2. This code scales A by 1/SQRT(M) if the largest ABS(A(i,j)) could cause
*> column norm overflow. This is the minimal precaution and it is left to the
*> SVD routine (SGESVD) to do its own preemptive scaling if potential over-
*> or underflows are detected. To avoid repeated scanning of the array A,
*> an optimal implementation would do all necessary scaling before calling
*> SGESVD and the scaling in SGESVD can be switched off.
*> 3. Other comments related to code optimization are given in comments in the
*> code, enclosed in [[double brackets]].
*> \endverbatim
*
*> \par Bugs, examples and comments
* ===========================
*
*> \verbatim
*> Please report all bugs and send interesting examples and/or comments to
*> drmac@math.hr. Thank you.
*> \endverbatim
*
*> \par References
* ===============
*
*> \verbatim
*> [1] Zlatko Drmac, Algorithm 977: A QR-Preconditioned QR SVD Method for
*> Computing the SVD with High Accuracy. ACM Trans. Math. Softw.
*> 44(1): 11:1-11:30 (2017)
*>
*> SIGMA library, xGESVDQ section updated February 2016.
*> Developed and coded by Zlatko Drmac, Department of Mathematics
*> University of Zagreb, Croatia, drmac@math.hr
*> \endverbatim
*
*
*> \par Contributors:
* ==================
*>
*> \verbatim
*> Developed and coded by Zlatko Drmac, Department of Mathematics
*> University of Zagreb, Croatia, drmac@math.hr
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date November 2018
*
*> \ingroup realGEsing
*
* =====================================================================
      SUBROUTINE SGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
     $                    S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
     $                    WORK, LWORK, RWORK, LRWORK, INFO )
*     .. Scalar Arguments ..
+ IMPLICIT NONE + CHARACTER JOBA, JOBP, JOBR, JOBU, JOBV + INTEGER M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LWORK, LRWORK, + $ INFO +* .. +* .. Array Arguments .. + REAL A( LDA, * ), U( LDU, * ), V( LDV, * ), WORK( * ) + REAL S( * ), RWORK( * ) + INTEGER IWORK( * ) +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) +* .. +* .. Local Scalars .. + INTEGER IERR, IWOFF, NR, N1, OPTRATIO, p, q + INTEGER LWCON, LWQP3, LWRK_SGELQF, LWRK_SGESVD, LWRK_SGESVD2, + $ LWRK_SGEQP3, LWRK_SGEQRF, LWRK_SORMLQ, LWRK_SORMQR, + $ LWRK_SORMQR2, LWLQF, LWQRF, LWSVD, LWSVD2, LWORQ, + $ LWORQ2, LWUNLQ, MINWRK, MINWRK2, OPTWRK, OPTWRK2, + $ IMINWRK, RMINWRK + LOGICAL ACCLA, ACCLM, ACCLH, ASCALED, CONDA, DNTWU, DNTWV, + $ LQUERY, LSVC0, LSVEC, ROWPRM, RSVEC, RTRANS, WNTUA, + $ WNTUF, WNTUR, WNTUS, WNTVA, WNTVR + REAL BIG, EPSLN, RTMP, SCONDA, SFMIN +* .. +* .. Local Arrays + REAL RDUMMY(1) +* .. +* .. External Subroutines (BLAS, LAPACK) + EXTERNAL SGELQF, SGEQP3, SGEQRF, SGESVD, SLACPY, SLAPMT, + $ SLASCL, SLASET, SLASWP, SSCAL, SPOCON, SORMLQ, + $ SORMQR, XERBLA +* .. +* .. External Functions (BLAS, LAPACK) + LOGICAL LSAME + INTEGER ISAMAX + REAL SLANGE, SNRM2, SLAMCH + EXTERNAL SLANGE, LSAME, ISAMAX, SNRM2, SLAMCH +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, REAL, SQRT +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + WNTUS = LSAME( JOBU, 'S' ) .OR. LSAME( JOBU, 'U' ) + WNTUR = LSAME( JOBU, 'R' ) + WNTUA = LSAME( JOBU, 'A' ) + WNTUF = LSAME( JOBU, 'F' ) + LSVC0 = WNTUS .OR. WNTUR .OR. WNTUA + LSVEC = LSVC0 .OR. WNTUF + DNTWU = LSAME( JOBU, 'N' ) +* + WNTVR = LSAME( JOBV, 'R' ) + WNTVA = LSAME( JOBV, 'A' ) .OR. LSAME( JOBV, 'V' ) + RSVEC = WNTVR .OR. WNTVA + DNTWV = LSAME( JOBV, 'N' ) +* + ACCLA = LSAME( JOBA, 'A' ) + ACCLM = LSAME( JOBA, 'M' ) + CONDA = LSAME( JOBA, 'E' ) + ACCLH = LSAME( JOBA, 'H' ) .OR. CONDA +* + ROWPRM = LSAME( JOBP, 'P' ) + RTRANS = LSAME( JOBR, 'T' ) +* + IF ( ROWPRM ) THEN + IF ( CONDA ) THEN + IMINWRK = MAX( 1, N + M - 1 + N ) + ELSE + IMINWRK = MAX( 1, N + M - 1 ) + END IF + RMINWRK = MAX( 2, M ) + ELSE + IF ( CONDA ) THEN + IMINWRK = MAX( 1, N + N ) + ELSE + IMINWRK = MAX( 1, N ) + END IF + RMINWRK = 2 + END IF + LQUERY = (LIWORK .EQ. -1 .OR. LWORK .EQ. -1 .OR. LRWORK .EQ. -1) + INFO = 0 + IF ( .NOT. ( ACCLA .OR. ACCLM .OR. ACCLH ) ) THEN + INFO = -1 + ELSE IF ( .NOT.( ROWPRM .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = -2 + ELSE IF ( .NOT.( RTRANS .OR. LSAME( JOBR, 'N' ) ) ) THEN + INFO = -3 + ELSE IF ( .NOT.( LSVEC .OR. DNTWU ) ) THEN + INFO = -4 + ELSE IF ( WNTUR .AND. WNTVA ) THEN + INFO = -5 + ELSE IF ( .NOT.( RSVEC .OR. DNTWV )) THEN + INFO = -5 + ELSE IF ( M.LT.0 ) THEN + INFO = -6 + ELSE IF ( ( N.LT.0 ) .OR. ( N.GT.M ) ) THEN + INFO = -7 + ELSE IF ( LDA.LT.MAX( 1, M ) ) THEN + INFO = -9 + ELSE IF ( LDU.LT.1 .OR. ( LSVC0 .AND. LDU.LT.M ) .OR. + $ ( WNTUF .AND. LDU.LT.N ) ) THEN + INFO = -12 + ELSE IF ( LDV.LT.1 .OR. ( RSVEC .AND. LDV.LT.N ) .OR. + $ ( CONDA .AND. LDV.LT.N ) ) THEN + INFO = -14 + ELSE IF ( LIWORK .LT. IMINWRK .AND. .NOT. LQUERY ) THEN + INFO = -17 + END IF +* +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LWORK are written with a lot of redundancy and +* can be simplified. However, this detailed form is easier for +* maintenance and modifications of the code.]] +* +* .. 
minimal workspace length for SGEQP3 of an M x N matrix
         LWQP3 = 3 * N + 1
*     .. minimal workspace length for SORMQR to build left singular vectors
         IF ( WNTUS .OR. WNTUR ) THEN
             LWORQ = MAX( N , 1 )
         ELSE IF ( WNTUA ) THEN
             LWORQ = MAX( M , 1 )
         END IF
*     .. minimal workspace length for SPOCON of an N x N matrix
         LWCON = 3 * N
*     .. SGESVD of an N x N matrix
         LWSVD = MAX( 5 * N, 1 )
         IF ( LQUERY ) THEN
             CALL SGEQP3( M, N, A, LDA, IWORK, RDUMMY, RDUMMY, -1,
     $            IERR )
             LWRK_SGEQP3 = INT( RDUMMY(1) )
             IF ( WNTUS .OR. WNTUR ) THEN
                 CALL SORMQR( 'L', 'N', M, N, N, A, LDA, RDUMMY, U,
     $                LDU, RDUMMY, -1, IERR )
                 LWRK_SORMQR = INT( RDUMMY(1) )
             ELSE IF ( WNTUA ) THEN
                 CALL SORMQR( 'L', 'N', M, M, N, A, LDA, RDUMMY, U,
     $                LDU, RDUMMY, -1, IERR )
                 LWRK_SORMQR = INT( RDUMMY(1) )
             ELSE
                 LWRK_SORMQR = 0
             END IF
         END IF
         MINWRK = 2
         OPTWRK = 2
         IF ( .NOT. (LSVEC .OR. RSVEC )) THEN
*         .. minimal and optimal sizes of the workspace if
*            only the singular values are requested
             IF ( CONDA ) THEN
                 MINWRK = MAX( N+LWQP3, LWCON, LWSVD )
             ELSE
                 MINWRK = MAX( N+LWQP3, LWSVD )
             END IF
             IF ( LQUERY ) THEN
                 CALL SGESVD( 'N', 'N', N, N, A, LDA, S, U, LDU,
     $                V, LDV, RDUMMY, -1, IERR )
                 LWRK_SGESVD = INT( RDUMMY(1) )
                 IF ( CONDA ) THEN
                     OPTWRK = MAX( N+LWRK_SGEQP3, N+LWCON, LWRK_SGESVD )
                 ELSE
                     OPTWRK = MAX( N+LWRK_SGEQP3, LWRK_SGESVD )
                 END IF
             END IF
         ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN
*         .. minimal and optimal sizes of the workspace if the
*            singular values and the left singular vectors are requested
             IF ( CONDA ) THEN
                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD, LWORQ )
             ELSE
                 MINWRK = N + MAX( LWQP3, LWSVD, LWORQ )
             END IF
             IF ( LQUERY ) THEN
                 IF ( RTRANS ) THEN
                     CALL SGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
     $                    V, LDV, RDUMMY, -1, IERR )
                 ELSE
                     CALL SGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
     $                    V, LDV, RDUMMY, -1, IERR )
                 END IF
                 LWRK_SGESVD = INT( RDUMMY(1) )
                 IF ( CONDA ) THEN
                     OPTWRK = N + MAX( LWRK_SGEQP3, LWCON, LWRK_SGESVD,
     $                        LWRK_SORMQR )
                 ELSE
                     OPTWRK = N + MAX( LWRK_SGEQP3, LWRK_SGESVD,
     $                        LWRK_SORMQR )
                 END IF
             END IF
         ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
*         .. minimal and optimal sizes of the workspace if the
*            singular values and the right singular vectors are requested
             IF ( CONDA ) THEN
                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD )
             ELSE
                 MINWRK = N + MAX( LWQP3, LWSVD )
             END IF
             IF ( LQUERY ) THEN
                 IF ( RTRANS ) THEN
                     CALL SGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
     $                    V, LDV, RDUMMY, -1, IERR )
                 ELSE
                     CALL SGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
     $                    V, LDV, RDUMMY, -1, IERR )
                 END IF
                 LWRK_SGESVD = INT( RDUMMY(1) )
                 IF ( CONDA ) THEN
                     OPTWRK = N + MAX( LWRK_SGEQP3, LWCON, LWRK_SGESVD )
                 ELSE
                     OPTWRK = N + MAX( LWRK_SGEQP3, LWRK_SGESVD )
                 END IF
             END IF
         ELSE
*         .. minimal and optimal sizes of the workspace if the
*            full SVD is requested
             IF ( RTRANS ) THEN
                 MINWRK = MAX( LWQP3, LWSVD, LWORQ )
                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
                 MINWRK = MINWRK + N
                 IF ( WNTVA ) THEN
*                  .. minimal workspace length for N x N/2 SGEQRF
                    LWQRF  = MAX( N/2, 1 )
*                  .. minimal workspace length for N/2 x N/2 SGESVD
                    LWSVD2 = MAX( 5 * (N/2), 1 )
                    LWORQ2 = MAX( N, 1 )
                    MINWRK2 = MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2,
     $                        N/2+LWORQ2, LWORQ )
                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
                    MINWRK2 = N + MINWRK2
                    MINWRK = MAX( MINWRK, MINWRK2 )
                 END IF
             ELSE
                 MINWRK = MAX( LWQP3, LWSVD, LWORQ )
                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
                 MINWRK = MINWRK + N
                 IF ( WNTVA ) THEN
*     .. 
minimal workspace length for N/2 x N SGELQF + LWLQF = MAX( N/2, 1 ) + LWSVD2 = MAX( 5 * (N/2), 1 ) + LWUNLQ = MAX( N , 1 ) + MINWRK2 = MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, + $ N/2+LWUNLQ, LWORQ ) + IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON ) + MINWRK2 = N + MINWRK2 + MINWRK = MAX( MINWRK, MINWRK2 ) + END IF + END IF + IF ( LQUERY ) THEN + IF ( RTRANS ) THEN + CALL SGESVD( 'O', 'A', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_SGESVD = INT( RDUMMY(1) ) + OPTWRK = MAX(LWRK_SGEQP3,LWRK_SGESVD,LWRK_SORMQR) + IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON ) + OPTWRK = N + OPTWRK + IF ( WNTVA ) THEN + CALL SGEQRF(N,N/2,U,LDU,RDUMMY,RDUMMY,-1,IERR) + LWRK_SGEQRF = INT( RDUMMY(1) ) + CALL SGESVD( 'S', 'O', N/2,N/2, V,LDV, S, U,LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_SGESVD2 = INT( RDUMMY(1) ) + CALL SORMQR( 'R', 'C', N, N, N/2, U, LDU, RDUMMY, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_SORMQR2 = INT( RDUMMY(1) ) + OPTWRK2 = MAX( LWRK_SGEQP3, N/2+LWRK_SGEQRF, + $ N/2+LWRK_SGESVD2, N/2+LWRK_SORMQR2 ) + IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON ) + OPTWRK2 = N + OPTWRK2 + OPTWRK = MAX( OPTWRK, OPTWRK2 ) + END IF + ELSE + CALL SGESVD( 'S', 'O', N, N, A, LDA, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_SGESVD = INT( RDUMMY(1) ) + OPTWRK = MAX(LWRK_SGEQP3,LWRK_SGESVD,LWRK_SORMQR) + IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON ) + OPTWRK = N + OPTWRK + IF ( WNTVA ) THEN + CALL SGELQF(N/2,N,U,LDU,RDUMMY,RDUMMY,-1,IERR) + LWRK_SGELQF = INT( RDUMMY(1) ) + CALL SGESVD( 'S','O', N/2,N/2, V, LDV, S, U, LDU, + $ V, LDV, RDUMMY, -1, IERR ) + LWRK_SGESVD2 = INT( RDUMMY(1) ) + CALL SORMLQ( 'R', 'N', N, N, N/2, U, LDU, RDUMMY, + $ V, LDV, RDUMMY,-1,IERR ) + LWRK_SORMLQ = INT( RDUMMY(1) ) + OPTWRK2 = MAX( LWRK_SGEQP3, N/2+LWRK_SGELQF, + $ N/2+LWRK_SGESVD2, N/2+LWRK_SORMLQ ) + IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON ) + OPTWRK2 = N + OPTWRK2 + OPTWRK = MAX( OPTWRK, OPTWRK2 ) + END IF + END IF + END IF + END IF +* + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( 2, OPTWRK ) + IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = -19 +* + END IF +* + IF (INFO .EQ. 0 .AND. LRWORK .LT. RMINWRK .AND. .NOT. LQUERY) THEN + INFO = -21 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGESVDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +* +* Return optimal workspace +* + IWORK(1) = IMINWRK + WORK(1) = OPTWRK + WORK(2) = MINWRK + RWORK(1) = RMINWRK + RETURN + END IF +* +* Quick return if the matrix is void. +* + IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) ) THEN +* .. all output is void. + RETURN + END IF +* + BIG = SLAMCH('O') + ASCALED = .FALSE. + IWOFF = 1 + IF ( ROWPRM ) THEN + IWOFF = M +* .. reordering the rows in decreasing sequence in the +* ell-infinity norm - this enhances numerical robustness in +* the case of differently scaled rows. + DO 1904 p = 1, M +* RWORK(p) = ABS( A(p,ICAMAX(N,A(p,1),LDA)) ) +* [[SLANGE will return NaN if an entry of the p-th row is Nan]] + RWORK(p) = SLANGE( 'M', 1, N, A(p,1), LDA, RDUMMY ) +* .. check for NaN's and Inf's + IF ( ( RWORK(p) .NE. RWORK(p) ) .OR. + $ ( (RWORK(p)*ZERO) .NE. ZERO ) ) THEN + INFO = -8 + CALL XERBLA( 'SGESVDQ', -INFO ) + RETURN + END IF + 1904 CONTINUE + DO 1952 p = 1, M - 1 + q = ISAMAX( M-p+1, RWORK(p), 1 ) + p - 1 + IWORK(N+p) = q + IF ( p .NE. q ) THEN + RTMP = RWORK(p) + RWORK(p) = RWORK(q) + RWORK(q) = RTMP + END IF + 1952 CONTINUE +* + IF ( RWORK(1) .EQ. ZERO ) THEN +* Quick return: A is the M x N zero matrix. 
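*        .. the quick return sets NUMRANK = 0, zeros out S and, where
*           requested, fills U and V with identity blocks; the pivoting
*           permutations recorded in IWORK are identities, and RWORK(2)
*           (and, if JOBA = 'E', RWORK(1)) is set to -1, as documented.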
+ NUMRANK = 0 + CALL SLASET( 'G', N, 1, ZERO, ZERO, S, N ) + IF ( WNTUS ) CALL SLASET('G', M, N, ZERO, ONE, U, LDU) + IF ( WNTUA ) CALL SLASET('G', M, M, ZERO, ONE, U, LDU) + IF ( WNTVA ) CALL SLASET('G', N, N, ZERO, ONE, V, LDV) + IF ( WNTUF ) THEN + CALL SLASET( 'G', N, 1, ZERO, ZERO, WORK, N ) + CALL SLASET( 'G', M, N, ZERO, ONE, U, LDU ) + END IF + DO 5001 p = 1, N + IWORK(p) = p + 5001 CONTINUE + IF ( ROWPRM ) THEN + DO 5002 p = N + 1, N + M - 1 + IWORK(p) = p - N + 5002 CONTINUE + END IF + IF ( CONDA ) RWORK(1) = -1 + RWORK(2) = -1 + RETURN + END IF +* + IF ( RWORK(1) .GT. BIG / SQRT(REAL(M)) ) THEN +* .. to prevent overflow in the QR factorization, scale the +* matrix by 1/sqrt(M) if too large entry detected + CALL SLASCL('G',0,0,SQRT(REAL(M)),ONE, M,N, A,LDA, IERR) + ASCALED = .TRUE. + END IF + CALL SLASWP( N, A, LDA, 1, M-1, IWORK(N+1), 1 ) + END IF +* +* .. At this stage, preemptive scaling is done only to avoid column +* norms overflows during the QR factorization. The SVD procedure should +* have its own scaling to save the singular values from overflows and +* underflows. That depends on the SVD procedure. +* + IF ( .NOT.ROWPRM ) THEN + RTMP = SLANGE( 'M', M, N, A, LDA, RDUMMY ) + IF ( ( RTMP .NE. RTMP ) .OR. + $ ( (RTMP*ZERO) .NE. ZERO ) ) THEN + INFO = -8 + CALL XERBLA( 'SGESVDQ', -INFO ) + RETURN + END IF + IF ( RTMP .GT. BIG / SQRT(REAL(M)) ) THEN +* .. to prevent overflow in the QR factorization, scale the +* matrix by 1/sqrt(M) if too large entry detected + CALL SLASCL('G',0,0, SQRT(REAL(M)),ONE, M,N, A,LDA, IERR) + ASCALED = .TRUE. + END IF + END IF +* +* .. QR factorization with column pivoting +* +* A * P = Q * [ R ] +* [ 0 ] +* + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL SGEQP3( M, N, A, LDA, IWORK, WORK, WORK(N+1), LWORK-N, + $ IERR ) +* +* If the user requested accuracy level allows truncation in the +* computed upper triangular factor, the matrix R is examined and, +* if possible, replaced with its leading upper trapezoidal part. +* + EPSLN = SLAMCH('E') + SFMIN = SLAMCH('S') +* SMALL = SFMIN / EPSLN + NR = N +* + IF ( ACCLA ) THEN +* +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPS*||A||_F are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPS*||A||_F. + NR = 1 + RTMP = SQRT(REAL(N))*EPSLN + DO 3001 p = 2, N + IF ( ABS(A(p,p)) .LT. (RTMP*ABS(A(1,1))) ) GO TO 3002 + NR = NR + 1 + 3001 CONTINUE + 3002 CONTINUE +* + ELSEIF ( ACCLM ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R is used as the criterion for being +* close-to-rank-deficient. The threshold is set to EPSLN=SLAMCH('E'). +* [[This can be made more flexible by replacing this hard-coded value +* with a user specified threshold.]] Also, the values that underflow +* will be truncated. + NR = 1 + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SFMIN ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* .. RRQR not authorized to determine numerical rank except in the +* obvious case of zero pivots. +* .. inspect R for exact zeros on the diagonal; +* R(i,i)=0 => R(i:N,i:N)=0. + NR = 1 + DO 3501 p = 2, N + IF ( ABS(A(p,p)) .EQ. ZERO ) GO TO 3502 + NR = NR + 1 + 3501 CONTINUE + 3502 CONTINUE +* + IF ( CONDA ) THEN +* Estimate the scaled condition number of A. 
Use the fact that it is
*        the same as the scaled condition number of R.
*        .. V is used as workspace
         CALL SLACPY( 'U', N, N, A, LDA, V, LDV )
*        Only the leading NR x NR submatrix of the triangular factor
*        is considered. Only if NR=N will this give a reliable error
*        bound. However, even for NR < N, this can be used on an
*        expert level and obtain useful information in the sense of
*        perturbation theory.
         DO 3053 p = 1, NR
            RTMP = SNRM2( p, V(1,p), 1 )
            CALL SSCAL( p, ONE/RTMP, V(1,p), 1 )
 3053    CONTINUE
         IF ( .NOT. ( LSVEC .OR. RSVEC ) ) THEN
             CALL SPOCON( 'U', NR, V, LDV, ONE, RTMP,
     $            WORK, IWORK(N+IWOFF), IERR )
         ELSE
             CALL SPOCON( 'U', NR, V, LDV, ONE, RTMP,
     $            WORK(N+1), IWORK(N+IWOFF), IERR )
         END IF
         SCONDA = ONE / SQRT(RTMP)
*        For NR=N, SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1),
*        N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
*        See the reference [1] for more details.
      END IF
*
      ENDIF
*
      IF ( WNTUR ) THEN
          N1 = NR
      ELSE IF ( WNTUS .OR. WNTUF) THEN
          N1 = N
      ELSE IF ( WNTUA ) THEN
          N1 = M
      END IF
*
      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
*.......................................................................
*     .. only the singular values are requested
*.......................................................................
         IF ( RTRANS ) THEN
*
*           .. compute the singular values of R**T = [A](1:NR,1:N)**T
*           .. set the lower triangle of [A] to [A](1:NR,1:N)**T and
*              the upper triangle of [A] to zero.
            DO 1146 p = 1, MIN( N, NR )
               DO 1147 q = p + 1, N
                  A(q,p) = A(p,q)
                  IF ( q .LE. NR ) A(p,q) = ZERO
 1147          CONTINUE
 1146       CONTINUE
*
            CALL SGESVD( 'N', 'N', N, NR, A, LDA, S, U, LDU,
     $           V, LDV, WORK, LWORK, INFO )
*
         ELSE
*
*           .. compute the singular values of R = [A](1:NR,1:N)
*
            IF ( NR .GT. 1 )
     $          CALL SLASET( 'L', NR-1,NR-1, ZERO,ZERO, A(2,1), LDA )
            CALL SGESVD( 'N', 'N', NR, N, A, LDA, S, U, LDU,
     $           V, LDV, WORK, LWORK, INFO )
*
         END IF
*
      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC) ) THEN
*.......................................................................
*     .. the singular values and the left singular vectors requested
*.......................................................................
         IF ( RTRANS ) THEN
*           .. apply SGESVD to R**T
*           .. copy R**T into [U] and overwrite [U] with the right singular
*              vectors of R
            DO 1192 p = 1, NR
               DO 1193 q = p, N
                  U(q,p) = A(p,q)
 1193          CONTINUE
 1192       CONTINUE
            IF ( NR .GT. 1 )
     $          CALL SLASET( 'U', NR-1,NR-1, ZERO,ZERO, U(1,2), LDU )
*           .. the left singular vectors not computed, the NR right singular
*              vectors overwrite [U](1:NR,1:NR) as transposed. These
*              will be pre-multiplied by Q to build the left singular vectors of A.
            CALL SGESVD( 'N', 'O', N, NR, U, LDU, S, U, LDU,
     $           U, LDU, WORK(N+1), LWORK-N, INFO )
*
            DO 1119 p = 1, NR
               DO 1120 q = p + 1, NR
                  RTMP   = U(q,p)
                  U(q,p) = U(p,q)
                  U(p,q) = RTMP
 1120          CONTINUE
 1119       CONTINUE
*
         ELSE
*           .. apply SGESVD to R
*           .. copy R into [U] and overwrite [U] with the left singular vectors
            CALL SLACPY( 'U', NR, N, A, LDA, U, LDU )
            IF ( NR .GT. 1 )
     $          CALL SLASET( 'L', NR-1, NR-1, ZERO, ZERO, U(2,1), LDU )
*           .. the right singular vectors not computed, the NR left singular
*              vectors overwrite [U](1:NR,1:NR)
            CALL SGESVD( 'O', 'N', NR, N, U, LDU, S, U, LDU,
     $           V, LDV, WORK(N+1), LWORK-N, INFO )
*           .. now [U](1:NR,1:NR) contains the NR left singular vectors of
*              R. These will be pre-multiplied by Q to build the left singular
*              vectors of A.
         END IF
*
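*     .. N1 (chosen above from JOBU) is the number of columns of U to be
*        assembled below: N1 = NR for JOBU = 'R'; N1 = N for JOBU = 'S',
*        'U' or 'F'; N1 = M for JOBU = 'A'.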
*     .. assemble the left singular vector matrix U of dimensions
*        (M x NR) or (M x N) or (M x M).
         IF ( ( NR .LT. M ) .AND. ( .NOT.WNTUF ) ) THEN
            CALL SLASET('A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU)
            IF ( NR .LT. N1 ) THEN
               CALL SLASET( 'A',NR,N1-NR,ZERO,ZERO,U(1,NR+1), LDU )
               CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE,
     $              U(NR+1,NR+1), LDU )
            END IF
         END IF
*
*        The Q matrix from the first QRF is built into the left singular
*        vectors matrix U.
*
         IF ( .NOT.WNTUF )
     $       CALL SORMQR( 'L', 'N', M, N1, N, A, LDA, WORK, U,
     $            LDU, WORK(N+1), LWORK-N, IERR )
         IF ( ROWPRM .AND. .NOT.WNTUF )
     $       CALL SLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
*
      ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
*.......................................................................
*     .. the singular values and the right singular vectors requested
*.......................................................................
         IF ( RTRANS ) THEN
*           .. apply SGESVD to R**T
*           .. copy R**T into V and overwrite V with the left singular vectors
            DO 1165 p = 1, NR
               DO 1166 q = p, N
                  V(q,p) = (A(p,q))
 1166          CONTINUE
 1165       CONTINUE
            IF ( NR .GT. 1 )
     $          CALL SLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
*           .. the left singular vectors of R**T overwrite V, the right singular
*              vectors not computed
            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
               CALL SGESVD( 'O', 'N', N, NR, V, LDV, S, U, LDU,
     $              U, LDU, WORK(N+1), LWORK-N, INFO )
*
               DO 1121 p = 1, NR
                  DO 1122 q = p + 1, NR
                     RTMP   = V(q,p)
                     V(q,p) = V(p,q)
                     V(p,q) = RTMP
 1122             CONTINUE
 1121          CONTINUE
*
               IF ( NR .LT. N ) THEN
                  DO 1103 p = 1, NR
                     DO 1104 q = NR + 1, N
                        V(p,q) = V(q,p)
 1104                CONTINUE
 1103             CONTINUE
               END IF
               CALL SLAPMT( .FALSE., NR, N, V, LDV, IWORK )
            ELSE
*              .. need all N right singular vectors and NR < N
*              [!] This is a simple implementation that augments [V](1:N,1:NR)
*              by padding a zero block. In the case NR << N, a more efficient
*              way is to first use the QR factorization. For more details
*              how to implement this, see the " FULL SVD " branch.
               CALL SLASET('G', N, N-NR, ZERO, ZERO, V(1,NR+1), LDV)
               CALL SGESVD( 'O', 'N', N, N, V, LDV, S, U, LDU,
     $              U, LDU, WORK(N+1), LWORK-N, INFO )
*
               DO 1123 p = 1, N
                  DO 1124 q = p + 1, N
                     RTMP   = V(q,p)
                     V(q,p) = V(p,q)
                     V(p,q) = RTMP
 1124             CONTINUE
 1123          CONTINUE
               CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK )
            END IF
*
         ELSE
*           .. apply SGESVD to R
*           .. copy R into V and overwrite V with the right singular vectors
            CALL SLACPY( 'U', NR, N, A, LDA, V, LDV )
            IF ( NR .GT. 1 )
     $          CALL SLASET( 'L', NR-1, NR-1, ZERO, ZERO, V(2,1), LDV )
*           .. the right singular vectors overwrite V, the NR left singular
*              vectors stored in U(1:NR,1:NR)
            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
               CALL SGESVD( 'N', 'O', NR, N, V, LDV, S, U, LDU,
     $              V, LDV, WORK(N+1), LWORK-N, INFO )
               CALL SLAPMT( .FALSE., NR, N, V, LDV, IWORK )
*              .. now [V](1:NR,1:N) contains V(1:N,1:NR)**T
            ELSE
*              .. need all N right singular vectors and NR < N
*              [!] This is a simple implementation that augments [V](1:NR,1:N)
*              by padding a zero block. In the case NR << N, a more efficient
*              way is to first use the LQ factorization. For more details
*              how to implement this, see the " FULL SVD " branch.
               CALL SLASET('G', N-NR, N, ZERO,ZERO, V(NR+1,1), LDV)
               CALL SGESVD( 'N', 'O', N, N, V, LDV, S, U, LDU,
     $              V, LDV, WORK(N+1), LWORK-N, INFO )
               CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK )
            END IF
*           .. now [V] contains the transposed matrix of the right singular
*              vectors of A.
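*           .. unlike U, the matrix V is not multiplied by the Q factor
*              of the initial QRF; only the column permutation of the
*              pivoted QRF is applied (SLAPMT above), restoring the
*              original column order of A.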
+ END IF +* + ELSE +*....................................................................... +* .. FULL SVD requested +*....................................................................... + IF ( RTRANS ) THEN +* +* .. apply SGESVD to R**T [[this option is left for R&D&T]] +* + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN +* .. copy R**T into [V] and overwrite [V] with the left singular +* vectors of R**T + DO 1168 p = 1, NR + DO 1169 q = p, N + V(q,p) = A(p,q) + 1169 CONTINUE + 1168 CONTINUE + IF ( NR .GT. 1 ) + $ CALL SLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV ) +* +* .. the left singular vectors of R**T overwrite [V], the NR right +* singular vectors of R**T stored in [U](1:NR,1:NR) as transposed + CALL SGESVD( 'O', 'A', N, NR, V, LDV, S, V, LDV, + $ U, LDU, WORK(N+1), LWORK-N, INFO ) +* .. assemble V + DO 1115 p = 1, NR + DO 1116 q = p + 1, NR + RTMP = V(q,p) + V(q,p) = V(p,q) + V(p,q) = RTMP + 1116 CONTINUE + 1115 CONTINUE + IF ( NR .LT. N ) THEN + DO 1101 p = 1, NR + DO 1102 q = NR+1, N + V(p,q) = V(q,p) + 1102 CONTINUE + 1101 CONTINUE + END IF + CALL SLAPMT( .FALSE., NR, N, V, LDV, IWORK ) +* + DO 1117 p = 1, NR + DO 1118 q = p + 1, NR + RTMP = U(q,p) + U(q,p) = U(p,q) + U(p,q) = RTMP + 1118 CONTINUE + 1117 CONTINUE +* + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL SLASET('A', M-NR,NR, ZERO,ZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL SLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF +* + ELSE +* .. need all N right singular vectors and NR < N +* .. copy R**T into [V] and overwrite [V] with the left singular +* vectors of R**T +* [[The optimal ratio N/NR for using QRF instead of padding +* with zeros. Here hard coded to 2; it must be at least +* two due to work space constraints.]] +* OPTRATIO = ILAENV(6, 'SGESVD', 'S' // 'O', NR,N,0,0) +* OPTRATIO = MAX( OPTRATIO, 2 ) + OPTRATIO = 2 + IF ( OPTRATIO*NR .GT. N ) THEN + DO 1198 p = 1, NR + DO 1199 q = p, N + V(q,p) = A(p,q) + 1199 CONTINUE + 1198 CONTINUE + IF ( NR .GT. 1 ) + $ CALL SLASET('U',NR-1,NR-1, ZERO,ZERO, V(1,2),LDV) +* + CALL SLASET('A',N,N-NR,ZERO,ZERO,V(1,NR+1),LDV) + CALL SGESVD( 'O', 'A', N, N, V, LDV, S, V, LDV, + $ U, LDU, WORK(N+1), LWORK-N, INFO ) +* + DO 1113 p = 1, N + DO 1114 q = p + 1, N + RTMP = V(q,p) + V(q,p) = V(p,q) + V(p,q) = RTMP + 1114 CONTINUE + 1113 CONTINUE + CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x N1), i.e. (M x N) or (M x M). +* + DO 1111 p = 1, N + DO 1112 q = p + 1, N + RTMP = U(q,p) + U(q,p) = U(p,q) + U(p,q) = RTMP + 1112 CONTINUE + 1111 CONTINUE +* + IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL SLASET('A',M-N,N,ZERO,ZERO,U(N+1,1),LDU) + IF ( N .LT. N1 ) THEN + CALL SLASET('A',N,N1-N,ZERO,ZERO,U(1,N+1),LDU) + CALL SLASET('A',M-N,N1-N,ZERO,ONE, + $ U(N+1,N+1), LDU ) + END IF + END IF + ELSE +* .. copy R**T into [U] and overwrite [U] with the right +* singular vectors of R + DO 1196 p = 1, NR + DO 1197 q = p, N + U(q,NR+p) = A(p,q) + 1197 CONTINUE + 1196 CONTINUE + IF ( NR .GT. 
1 ) + $ CALL SLASET('U',NR-1,NR-1,ZERO,ZERO,U(1,NR+2),LDU) + CALL SGEQRF( N, NR, U(1,NR+1), LDU, WORK(N+1), + $ WORK(N+NR+1), LWORK-N-NR, IERR ) + DO 1143 p = 1, NR + DO 1144 q = 1, N + V(q,p) = U(p,NR+q) + 1144 CONTINUE + 1143 CONTINUE + CALL SLASET('U',NR-1,NR-1,ZERO,ZERO,V(1,2),LDV) + CALL SGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU, + $ V,LDV, WORK(N+NR+1),LWORK-N-NR, INFO ) + CALL SLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV) + CALL SLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV) + CALL SLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV) + CALL SORMQR('R','C', N, N, NR, U(1,NR+1), LDU, + $ WORK(N+1),V,LDV,WORK(N+NR+1),LWORK-N-NR,IERR) + CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL SLASET('A',M-NR,NR,ZERO,ZERO,U(NR+1,1),LDU) + IF ( NR .LT. N1 ) THEN + CALL SLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF + END IF + END IF +* + ELSE +* +* .. apply SGESVD to R [[this is the recommended option]] +* + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN +* .. copy R into [V] and overwrite V with the right singular vectors + CALL SLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL SLASET( 'L', NR-1,NR-1, ZERO,ZERO, V(2,1), LDV ) +* .. the right singular vectors of R overwrite [V], the NR left +* singular vectors of R stored in [U](1:NR,1:NR) + CALL SGESVD( 'S', 'O', NR, N, V, LDV, S, U, LDU, + $ V, LDV, WORK(N+1), LWORK-N, INFO ) + CALL SLAPMT( .FALSE., NR, N, V, LDV, IWORK ) +* .. now [V](1:NR,1:N) contains V(1:N,1:NR)**T +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL SLASET('A', M-NR,NR, ZERO,ZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL SLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF +* + ELSE +* .. need all N right singular vectors and NR < N +* .. the requested number of the left singular vectors +* is then N1 (N or M) +* [[The optimal ratio N/NR for using LQ instead of padding +* with zeros. Here hard coded to 2; it must be at least +* two due to work space constraints.]] +* OPTRATIO = ILAENV(6, 'SGESVD', 'S' // 'O', NR,N,0,0) +* OPTRATIO = MAX( OPTRATIO, 2 ) + OPTRATIO = 2 + IF ( OPTRATIO * NR .GT. N ) THEN + CALL SLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL SLASET('L', NR-1,NR-1, ZERO,ZERO, V(2,1),LDV) +* .. the right singular vectors of R overwrite [V], the NR left +* singular vectors of R stored in [U](1:NR,1:NR) + CALL SLASET('A', N-NR,N, ZERO,ZERO, V(NR+1,1),LDV) + CALL SGESVD( 'S', 'O', N, N, V, LDV, S, U, LDU, + $ V, LDV, WORK(N+1), LWORK-N, INFO ) + CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. now [V] contains the transposed matrix of the right +* singular vectors of A. The leading N left singular vectors +* are in [U](1:N,1:N) +* .. assemble the left singular vector matrix U of dimensions +* (M x N1), i.e. (M x N) or (M x M). + IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL SLASET('A',M-N,N,ZERO,ZERO,U(N+1,1),LDU) + IF ( N .LT. N1 ) THEN + CALL SLASET('A',N,N1-N,ZERO,ZERO,U(1,N+1),LDU) + CALL SLASET( 'A',M-N,N1-N,ZERO,ONE, + $ U(N+1,N+1), LDU ) + END IF + END IF + ELSE + CALL SLACPY( 'U', NR, N, A, LDA, U(NR+1,1), LDU ) + IF ( NR .GT. 
1 ) + $ CALL SLASET('L',NR-1,NR-1,ZERO,ZERO,U(NR+2,1),LDU) + CALL SGELQF( NR, N, U(NR+1,1), LDU, WORK(N+1), + $ WORK(N+NR+1), LWORK-N-NR, IERR ) + CALL SLACPY('L',NR,NR,U(NR+1,1),LDU,V,LDV) + IF ( NR .GT. 1 ) + $ CALL SLASET('U',NR-1,NR-1,ZERO,ZERO,V(1,2),LDV) + CALL SGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU, + $ V, LDV, WORK(N+NR+1), LWORK-N-NR, INFO ) + CALL SLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV) + CALL SLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV) + CALL SLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV) + CALL SORMLQ('R','N',N,N,NR,U(NR+1,1),LDU,WORK(N+1), + $ V, LDV, WORK(N+NR+1),LWORK-N-NR,IERR) + CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL SLASET('A',M-NR,NR,ZERO,ZERO,U(NR+1,1),LDU) + IF ( NR .LT. N1 ) THEN + CALL SLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF + END IF + END IF +* .. end of the "R**T or R" branch + END IF +* +* The Q matrix from the first QRF is built into the left singular +* vectors matrix U. +* + IF ( .NOT. WNTUF ) + $ CALL SORMQR( 'L', 'N', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) + IF ( ROWPRM .AND. .NOT.WNTUF ) + $ CALL SLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 ) +* +* ... end of the "full SVD" branch + END IF +* +* Check whether some singular values are returned as zeros, e.g. +* due to underflow, and update the numerical rank. + p = NR + DO 4001 q = p, 1, -1 + IF ( S(q) .GT. ZERO ) GO TO 4002 + NR = NR - 1 + 4001 CONTINUE + 4002 CONTINUE +* +* .. if numerical rank deficiency is detected, the truncated +* singular values are set to zero. + IF ( NR .LT. N ) CALL SLASET( 'G', N-NR,1, ZERO,ZERO, S(NR+1), N ) +* .. undo scaling; this may cause overflow in the largest singular +* values. + IF ( ASCALED ) + $ CALL SLASCL( 'G',0,0, ONE,SQRT(REAL(M)), NR,1, S, N, IERR ) + IF ( CONDA ) RWORK(1) = SCONDA + RWORK(2) = p - NR +* .. p-NR is the number of singular values that are computed as +* exact zeros in SGESVD() applied to the (possibly truncated) +* full row rank triangular (trapezoidal) factor of A. + NUMRANK = NR +* + RETURN +* +* End of SGESVDQ +* + END diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f index 7a7901135..fee5aba4a 100644 --- a/lapack-netlib/SRC/sgesvj.f +++ b/lapack-netlib/SRC/sgesvj.f @@ -90,13 +90,13 @@ *> JOBV is CHARACTER*1 *> Specifies whether to compute the right singular vectors, that *> is, the matrix V: -*> = 'V' : the matrix V is computed and returned in the array V -*> = 'A' : the Jacobi rotations are applied to the MV-by-N +*> = 'V': the matrix V is computed and returned in the array V +*> = 'A': the Jacobi rotations are applied to the MV-by-N *> array V. In other words, the right singular vector *> matrix V is not computed explicitly; instead it is *> applied to an MV-by-N matrix initially stored in the *> first MV rows of V. -*> = 'N' : the matrix V is not computed and the array V is not +*> = 'N': the matrix V is not computed and the array V is not *> referenced *> \endverbatim *> @@ -118,8 +118,8 @@ *> A is REAL array, dimension (LDA,N) *> On entry, the M-by-N matrix A. *> On exit, -*> If JOBU .EQ. 'U' .OR. JOBU .EQ. 'C': -*> If INFO .EQ. 0 : +*> If JOBU = 'U' .OR. JOBU = 'C': +*> If INFO = 0: *> RANKA orthonormal columns of U are returned in the *> leading RANKA columns of the array A. 
Here RANKA <= N *> is the number of computed singular values of A that are @@ -129,9 +129,9 @@ *> in the array WORK as RANKA=NINT(WORK(2)). Also see the *> descriptions of SVA and WORK. The computed columns of U *> are mutually numerically orthogonal up to approximately -*> TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU.EQ.'C'), +*> TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU = 'C'), *> see the description of JOBU. -*> If INFO .GT. 0, +*> If INFO > 0, *> the procedure SGESVJ did not converge in the given number *> of iterations (sweeps). In that case, the computed *> columns of U may not be orthogonal up to TOL. The output @@ -139,8 +139,8 @@ *> values in SVA(1:N)) and V is still a decomposition of the *> input matrix A in the sense that the residual *> ||A-SCALE*U*SIGMA*V^T||_2 / ||A||_2 is small. -*> If JOBU .EQ. 'N': -*> If INFO .EQ. 0 : +*> If JOBU = 'N': +*> If INFO = 0: *> Note that the left singular vectors are 'for free' in the *> one-sided Jacobi SVD algorithm. However, if only the *> singular values are needed, the level of numerical @@ -149,7 +149,7 @@ *> numerically orthogonal up to approximately M*EPS. Thus, *> on exit, A contains the columns of U scaled with the *> corresponding singular values. -*> If INFO .GT. 0 : +*> If INFO > 0: *> the procedure SGESVJ did not converge in the given number *> of iterations (sweeps). *> \endverbatim @@ -164,9 +164,9 @@ *> \verbatim *> SVA is REAL array, dimension (N) *> On exit, -*> If INFO .EQ. 0 : +*> If INFO = 0 : *> depending on the value SCALE = WORK(1), we have: -*> If SCALE .EQ. ONE: +*> If SCALE = ONE: *> SVA(1:N) contains the computed singular values of A. *> During the computation SVA contains the Euclidean column *> norms of the iterated matrices in the array A. @@ -175,7 +175,7 @@ *> factored representation is due to the fact that some of the *> singular values of A might underflow or overflow. *> -*> If INFO .GT. 0 : +*> If INFO > 0 : *> the procedure SGESVJ did not converge in the given number of *> iterations (sweeps) and SCALE*SVA(1:N) may not be accurate. *> \endverbatim @@ -183,7 +183,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then the product of Jacobi rotations in SGESVJ +*> If JOBV = 'A', then the product of Jacobi rotations in SGESVJ *> is applied to the first MV rows of V. See the description of JOBV. *> \endverbatim *> @@ -201,16 +201,16 @@ *> \param[in] LDV *> \verbatim *> LDV is INTEGER -*> The leading dimension of the array V, LDV .GE. 1. -*> If JOBV .EQ. 'V', then LDV .GE. max(1,N). -*> If JOBV .EQ. 'A', then LDV .GE. max(1,MV) . +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V', then LDV >= max(1,N). +*> If JOBV = 'A', then LDV >= max(1,MV) . *> \endverbatim *> *> \param[in,out] WORK *> \verbatim *> WORK is REAL array, dimension (LWORK) *> On entry, -*> If JOBU .EQ. 'C' : +*> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. *> The process stops if all columns of A are mutually *> orthogonal up to CTOL*EPS, EPS=SLAMCH('E'). @@ -230,7 +230,7 @@ *> WORK(5) = max_{i.NE.j} |COS(A(:,i),A(:,j))| in the last sweep. *> This is useful information in cases when SGESVJ did *> not converge, as it can be used to estimate whether -*> the output is stil useful and for post festum analysis. +*> the output is still useful and for post festum analysis. *> WORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. 
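The scaled representation of the computed singular values described
above (SIGMA(i) = SCALE * SVA(i), with SCALE returned in WORK(1)) is
easiest to see in a small driver. The following sketch is illustrative
only and assumes the standard SGESVJ interface; the test matrix, JOB
settings, and workspace size are placeholder choices:

      PROGRAM TSVJ
*     .. a minimal SGESVJ call: singular values only (JOBU = JOBV = 'N');
*        the 4 x 3 test matrix below is an arbitrary example.
      INTEGER            M, N, LWORK
      PARAMETER          ( M = 4, N = 3, LWORK = M+N+6 )
      REAL               A( M, N ), SVA( N ), V( 1, 1 ), WORK( LWORK )
      INTEGER            I, J, INFO
      EXTERNAL           SGESVJ
      DO 20 J = 1, N
         DO 10 I = 1, M
            A( I, J ) = REAL( I + J )
   10    CONTINUE
   20 CONTINUE
*     .. JOBA = 'G': A is treated as a general M x N matrix
      CALL SGESVJ( 'G', 'N', 'N', M, N, A, M, SVA, 0, V, 1,
     $             WORK, LWORK, INFO )
      IF( INFO.EQ.0 ) THEN
*        .. the singular values are SCALE*SVA(i), with SCALE = WORK(1)
         WRITE( *, * ) ( WORK(1)*SVA(I), I = 1, N )
      END IF
      END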
@@ -245,9 +245,9 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value -*> > 0 : SGESVJ did not converge in the maximal allowed number (30) +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value +*> > 0: SGESVJ did not converge in the maximal allowed number (30) *> of sweeps. The output may still be useful. See the *> description of WORK. *> \endverbatim diff --git a/lapack-netlib/SRC/sgesvxx.f b/lapack-netlib/SRC/sgesvxx.f index 281f198d5..7cb29d5ab 100644 --- a/lapack-netlib/SRC/sgesvxx.f +++ b/lapack-netlib/SRC/sgesvxx.f @@ -411,7 +411,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -447,14 +447,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -462,9 +462,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/sgetc2.f b/lapack-netlib/SRC/sgetc2.f index b0301b953..6bf0a93c6 100644 --- a/lapack-netlib/SRC/sgetc2.f +++ b/lapack-netlib/SRC/sgetc2.f @@ -85,7 +85,7 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> > 0: if INFO = k, U(k, k) is likely to produce owerflow if +*> > 0: if INFO = k, U(k, k) is likely to produce overflow if *> we try to solve for x in Ax = b. So U is perturbed to *> avoid the overflow. *> \endverbatim diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index 35af66c19..53d2f9431 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -1,3 +1,5 @@ +*> \brief \b SGETSLS +* * Definition: * =========== * @@ -154,7 +156,7 @@ * *> \date June 2017 * -*> \ingroup doubleGEsolve +*> \ingroup realGEsolve * * ===================================================================== SUBROUTINE SGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, diff --git a/lapack-netlib/SRC/sggesx.f b/lapack-netlib/SRC/sggesx.f index 3c6273dcf..25691d164 100644 --- a/lapack-netlib/SRC/sggesx.f +++ b/lapack-netlib/SRC/sggesx.f @@ -131,10 +131,10 @@ *> \verbatim *> SENSE is CHARACTER*1 *> Determines which reciprocal condition numbers are computed. 
-*> = 'N' : None are computed; -*> = 'E' : Computed for average of selected eigenvalues only; -*> = 'V' : Computed for selected deflating subspaces only; -*> = 'B' : Computed for both. +*> = 'N': None are computed; +*> = 'E': Computed for average of selected eigenvalues only; +*> = 'V': Computed for selected deflating subspaces only; +*> = 'B': Computed for both. *> If SENSE = 'E', 'V', or 'B', SORT must equal 'S'. *> \endverbatim *> diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f index e580efc30..d9177d818 100644 --- a/lapack-netlib/SRC/sgsvj0.f +++ b/lapack-netlib/SRC/sgsvj0.f @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is REAL array, dimension (LDV,N) -*> If JOBV .EQ. 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multipled by a *> sequence of Jacobi rotations. -*> If JOBV .EQ. 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim @@ -136,8 +136,8 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', LDV .GE. N. -*> If JOBV = 'A', LDV .GE. MV. +*> If JOBV = 'V', LDV >= N. +*> If JOBV = 'A', LDV >= MV. *> \endverbatim *> *> \param[in] EPS @@ -157,7 +157,7 @@ *> TOL is REAL *> TOL is the threshold for Jacobi rotations. For a pair *> A(:,p), A(:,q) of pivot columns, the Jacobi rotation is -*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL. +*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL. *> \endverbatim *> *> \param[in] NSWEEP @@ -175,14 +175,14 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> LWORK is the dimension of WORK. LWORK .GE. M. +*> LWORK is the dimension of WORK. LWORK >= M. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value *> \endverbatim * * Authors: @@ -1045,7 +1045,7 @@ 1993 CONTINUE * end i=1:NSWEEP loop -* #:) Reaching this point means that the procedure has comleted the given +* #:) Reaching this point means that the procedure has completed the given * number of iterations. INFO = NSWEEP - 1 GO TO 1995 diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f index 49b81cf4f..ea4ba2e0e 100644 --- a/lapack-netlib/SRC/sgsvj1.f +++ b/lapack-netlib/SRC/sgsvj1.f @@ -61,7 +61,7 @@ *> In terms of the columns of A, the first N1 columns are rotated 'against' *> the remaining N-N1 columns, trying to increase the angle between the *> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is -*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parmeter. +*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parameter. *> The number of sweeps is given in NSWEEP and the orthogonality threshold *> is given in TOL. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. 
*> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is REAL array, dimension (LDV,N) -*> If JOBV .EQ. 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multipled by a *> sequence of Jacobi rotations. -*> If JOBV .EQ. 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim @@ -166,8 +166,8 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', LDV .GE. N. -*> If JOBV = 'A', LDV .GE. MV. +*> If JOBV = 'V', LDV >= N. +*> If JOBV = 'A', LDV >= MV. *> \endverbatim *> *> \param[in] EPS @@ -187,7 +187,7 @@ *> TOL is REAL *> TOL is the threshold for Jacobi rotations. For a pair *> A(:,p), A(:,q) of pivot columns, the Jacobi rotation is -*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL. +*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL. *> \endverbatim *> *> \param[in] NSWEEP @@ -205,14 +205,14 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> LWORK is the dimension of WORK. LWORK .GE. M. +*> LWORK is the dimension of WORK. LWORK >= M. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index 5654a4682..b5707f2c3 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -70,7 +70,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -87,7 +87,7 @@ *> set by a previous call to SGEBAL, and then passed to ZGEHRD *> when the matrix output by SGEBAL is reduced to Hessenberg *> form. Otherwise ILO and IHI should be set to 1 and N -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -100,20 +100,20 @@ *> (the Schur form); 2-by-2 diagonal blocks (corresponding to *> complex conjugate pairs of eigenvalues) are returned in *> standard form, with H(i,i) = H(i+1,i+1) and -*> H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the +*> H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and JOB = 'E', the *> contents of H are unspecified on exit. (The output value of -*> H when INFO.GT.0 is given under the description of INFO +*> H when INFO > 0 is given under the description of INFO *> below.) *> *> Unlike earlier versions of SHSEQR, this subroutine may -*> explicitly H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1 +*> explicitly H(i,j) = 0 for i > j and j = 1, 2, ... ILO-1 *> or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] WR @@ -128,8 +128,8 @@ *> The real and imaginary parts, respectively, of the computed *> eigenvalues. If two eigenvalues are computed as a complex *> conjugate pair, they are stored in consecutive elements of -*> WR and WI, say the i-th and (i+1)th, with WI(i) .GT. 0 and -*> WI(i+1) .LT. 0. 
If JOB = 'S', the eigenvalues are stored in +*> WR and WI, say the i-th and (i+1)th, with WI(i) > 0 and +*> WI(i+1) < 0. If JOB = 'S', the eigenvalues are stored in *> the same order as on the diagonal of the Schur form returned *> in H, with WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 *> diagonal block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and @@ -148,7 +148,7 @@ *> if INFO = 0, Z contains Q*Z. *> Normally Q is the orthogonal matrix generated by SORGHR *> after the call to SGEHRD which formed the Hessenberg matrix -*> H. (The output value of Z when INFO.GT.0 is given under +*> H. (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -156,7 +156,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if COMPZ = 'I' or -*> COMPZ = 'V', then LDZ.GE.MAX(1,N). Otherwize, LDZ.GE.1. +*> COMPZ = 'V', then LDZ >= MAX(1,N). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -169,7 +169,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient and delivers very good and sometimes *> optimal performance. However, LWORK as large as 11*N *> may be required for optimal performance. A workspace @@ -187,21 +187,21 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .LT. 0: if INFO = -i, the i-th argument had an illegal +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal *> value -*> .GT. 0: if INFO = i, SHSEQR failed to compute all of +*> > 0: if INFO = i, SHSEQR failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and JOB = 'E', then on exit, the +*> If INFO > 0 and JOB = 'E', then on exit, the *> remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and JOB = 'S', then on exit +*> If INFO > 0 and JOB = 'S', then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -209,19 +209,19 @@ *> value of H is upper Hessenberg and quasi-triangular *> in rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and COMPZ = 'V', then on exit +*> If INFO > 0 and COMPZ = 'V', then on exit *> *> (final value of Z) = (initial value of Z)*U *> *> where U is the orthogonal matrix in (*) (regard- *> less of the value of JOB.) *> -*> If INFO .GT. 0 and COMPZ = 'I', then on exit +*> If INFO > 0 and COMPZ = 'I', then on exit *> (final value of Z) = U *> where U is the orthogonal matrix in (*) (regard- *> less of the value of JOB.) *> -*> If INFO .GT. 0 and COMPZ = 'N', then Z is not +*> If INFO > 0 and COMPZ = 'N', then Z is not *> accessed. *> \endverbatim * @@ -261,8 +261,8 @@ *> This depends on ILO, IHI and NS. NS is the *> number of simultaneous shifts returned *> by ILAENV(ISPEC=15). (See ISPEC=15 below.) -*> The default for (IHI-ILO+1).LE.500 is NS. -*> The default for (IHI-ILO+1).GT.500 is 3*NS/2. +*> The default for (IHI-ILO+1) <= 500 is NS. +*> The default for (IHI-ILO+1) > 500 is 3*NS/2. *> *> ISPEC=14: Nibble crossover point. (See IPARMQ for *> details.) Default: 14% of deflation window @@ -341,8 +341,8 @@ PARAMETER ( NTINY = 11 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare SLAHQR failure. NL .GT. 
NTINY = 11 is
-* .    required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+* .    through a rare SLAHQR failure. NL > NTINY = 11 is
+* .    required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom-
* .    mended. (The default value of NMIN is 75.) Using NL = 49
* .    allows up to six simultaneous shifts and a 16-by-16
* .    deflation window. ====
diff --git a/lapack-netlib/SRC/sla_gbrcond.f b/lapack-netlib/SRC/sla_gbrcond.f
index 36aa93dc9..7f2c4062e 100644
--- a/lapack-netlib/SRC/sla_gbrcond.f
+++ b/lapack-netlib/SRC/sla_gbrcond.f
@@ -140,13 +140,13 @@
*> i > 0: The ith argument is invalid.
*> \endverbatim
*>
-*> \param[in] WORK
+*> \param[out] WORK
*> \verbatim
*> WORK is REAL array, dimension (5*N).
*> Workspace.
*> \endverbatim
*>
-*> \param[in] IWORK
+*> \param[out] IWORK
*> \verbatim
*> IWORK is INTEGER array, dimension (N).
*> Workspace.
*> \endverbatim
diff --git a/lapack-netlib/SRC/sla_gbrfsx_extended.f b/lapack-netlib/SRC/sla_gbrfsx_extended.f
index a81feb45e..0fd1fd350 100644
--- a/lapack-netlib/SRC/sla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/sla_gbrfsx_extended.f
@@ -65,19 +65,19 @@
*> \verbatim
*> PREC_TYPE is INTEGER
*> Specifies the intermediate precision to be used in refinement.
-*> The value is defined by ILAPREC(P) where P is a CHARACTER and
-*> P = 'S': Single
+*> The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*> = 'S': Single
*> = 'D': Double
*> = 'I': Indigenous
-*> = 'X', 'E': Extra
+*> = 'X' or 'E': Extra
*> \endverbatim
*>
*> \param[in] TRANS_TYPE
*> \verbatim
*> TRANS_TYPE is INTEGER
*> Specifies the transposition operation on A.
-*> The value is defined by ILATRANS(T) where T is a CHARACTER and
-*> T = 'N': No transpose
+*> The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*> = 'N': No transpose
*> = 'T': Transpose
*> = 'C': Conjugate transpose
*> \endverbatim
@@ -269,7 +269,7 @@
*> information as described below. There currently are up to three
*> pieces of information returned for each right-hand side. If
*> componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most
+*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most
*> the first (:,N_ERR_BNDS) entries are returned.
*>
*> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/sla_gercond.f b/lapack-netlib/SRC/sla_gercond.f
index 349a1b5be..e54e0d7b4 100644
--- a/lapack-netlib/SRC/sla_gercond.f
+++ b/lapack-netlib/SRC/sla_gercond.f
@@ -122,13 +122,13 @@
*> i > 0: The ith argument is invalid.
*> \endverbatim
*>
-*> \param[in] WORK
+*> \param[out] WORK
*> \verbatim
*> WORK is REAL array, dimension (3*N).
*> Workspace.
*> \endverbatim
*>
-*> \param[in] IWORK
+*> \param[out] IWORK
*> \verbatim
*> IWORK is INTEGER array, dimension (N).
*> Workspace.
diff --git a/lapack-netlib/SRC/sla_gerfsx_extended.f b/lapack-netlib/SRC/sla_gerfsx_extended.f
index 1795ea975..84d1ae31b 100644
--- a/lapack-netlib/SRC/sla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/sla_gerfsx_extended.f
@@ -65,19 +65,19 @@
*> \verbatim
*> PREC_TYPE is INTEGER
*> Specifies the intermediate precision to be used in refinement.
-*> The value is defined by ILAPREC(P) where P is a CHARACTER and
-*> P = 'S': Single
+*> The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*> = 'S': Single
*> = 'D': Double
*> = 'I': Indigenous
-*> = 'X', 'E': Extra
+*> = 'X' or 'E': Extra
*> \endverbatim
*>
*> \param[in] TRANS_TYPE
*> \verbatim
*> TRANS_TYPE is INTEGER
*> Specifies the transposition operation on A. 
-*> The value is defined by ILATRANS(T) where T is a CHARACTER and -*> T = 'N': No transpose +*> The value is defined by ILATRANS(T) where T is a CHARACTER and T +*> = 'N': No transpose *> = 'T': Transpose *> = 'C': Conjugate transpose *> \endverbatim @@ -257,7 +257,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERRS_C is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERRS_C is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERRS_C(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/sla_porcond.f b/lapack-netlib/SRC/sla_porcond.f index 9dd7c587b..729581f46 100644 --- a/lapack-netlib/SRC/sla_porcond.f +++ b/lapack-netlib/SRC/sla_porcond.f @@ -112,13 +112,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (3*N). *> Workspace. *> \endverbatim *> -*> \param[in] IWORK +*> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/sla_porfsx_extended.f b/lapack-netlib/SRC/sla_porfsx_extended.f index 27baa20d1..abbfebb83 100644 --- a/lapack-netlib/SRC/sla_porfsx_extended.f +++ b/lapack-netlib/SRC/sla_porfsx_extended.f @@ -65,11 +65,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -246,7 +246,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/sla_syrcond.f b/lapack-netlib/SRC/sla_syrcond.f index c4b204cc6..0c9e2b361 100644 --- a/lapack-netlib/SRC/sla_syrcond.f +++ b/lapack-netlib/SRC/sla_syrcond.f @@ -118,13 +118,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (3*N). *> Workspace. *> \endverbatim *> -*> \param[in] IWORK +*> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/sla_syrfsx_extended.f b/lapack-netlib/SRC/sla_syrfsx_extended.f index f7b909ac0..a83a9db98 100644 --- a/lapack-netlib/SRC/sla_syrfsx_extended.f +++ b/lapack-netlib/SRC/sla_syrfsx_extended.f @@ -67,11 +67,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -255,7 +255,7 @@ *> information as described below. 
There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/sla_syrpvgrw.f b/lapack-netlib/SRC/sla_syrpvgrw.f index f5eb81b1f..a0a235ee3 100644 --- a/lapack-netlib/SRC/sla_syrpvgrw.f +++ b/lapack-netlib/SRC/sla_syrpvgrw.f @@ -101,7 +101,7 @@ *> as determined by SSYTRF. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (2*N) *> \endverbatim diff --git a/lapack-netlib/SRC/sla_wwaddw.f b/lapack-netlib/SRC/sla_wwaddw.f index 96a7d3542..e390c9fab 100644 --- a/lapack-netlib/SRC/sla_wwaddw.f +++ b/lapack-netlib/SRC/sla_wwaddw.f @@ -36,7 +36,7 @@ *> SLA_WWADDW adds a vector W into a doubled-single vector (X, Y). *> *> This works for all extant IBM's hex and binary floating point -*> arithmetics, but not for decimal. +*> arithmetic, but not for decimal. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/slaed4.f b/lapack-netlib/SRC/slaed4.f index c65cba75a..64260843f 100644 --- a/lapack-netlib/SRC/slaed4.f +++ b/lapack-netlib/SRC/slaed4.f @@ -82,7 +82,7 @@ *> \param[out] DELTA *> \verbatim *> DELTA is REAL array, dimension (N) -*> If N .GT. 2, DELTA contains (D(j) - lambda_I) in its j-th +*> If N > 2, DELTA contains (D(j) - lambda_I) in its j-th *> component. If N = 1, then DELTA(1) = 1. If N = 2, see SLAED5 *> for detail. The vector DELTA contains the information necessary *> to construct the eigenvectors by SLAED3 and SLAED9. diff --git a/lapack-netlib/SRC/slaed8.f b/lapack-netlib/SRC/slaed8.f index 5ec117cb5..2e3f6f51f 100644 --- a/lapack-netlib/SRC/slaed8.f +++ b/lapack-netlib/SRC/slaed8.f @@ -353,7 +353,7 @@ Z( I ) = W( INDX( I ) ) 40 CONTINUE * -* Calculate the allowable deflation tolerence +* Calculate the allowable deflation tolerance * IMAX = ISAMAX( N, Z, 1 ) JMAX = ISAMAX( N, D, 1 ) diff --git a/lapack-netlib/SRC/slagtf.f b/lapack-netlib/SRC/slagtf.f index d3f0b6813..59ef097a7 100644 --- a/lapack-netlib/SRC/slagtf.f +++ b/lapack-netlib/SRC/slagtf.f @@ -125,7 +125,7 @@ *> then IN(k) = 1, otherwise IN(k) = 0. The element IN(n) *> returns the smallest positive integer j such that *> -*> abs( u(j,j) ).le. norm( (T - lambda*I)(j) )*TOL, +*> abs( u(j,j) ) <= norm( (T - lambda*I)(j) )*TOL, *> *> where norm( A(j) ) denotes the sum of the absolute values of *> the jth row of the matrix A. If no such j exists then IN(n) @@ -137,8 +137,8 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit -*> .lt. 0: if INFO = -k, the kth argument had an illegal value +*> = 0: successful exit +*> < 0: if INFO = -k, the kth argument had an illegal value *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slagts.f b/lapack-netlib/SRC/slagts.f index 0c3c5239f..e0c8892d7 100644 --- a/lapack-netlib/SRC/slagts.f +++ b/lapack-netlib/SRC/slagts.f @@ -122,12 +122,12 @@ *> \param[in,out] TOL *> \verbatim *> TOL is REAL -*> On entry, with JOB .lt. 0, TOL should be the minimum +*> On entry, with JOB < 0, TOL should be the minimum *> perturbation to be made to very small diagonal elements of U. 
*> TOL should normally be chosen as about eps*norm(U), where eps *> is the relative machine precision, but if TOL is supplied as *> non-positive, then it is reset to eps*max( abs( u(i,j) ) ). -*> If JOB .gt. 0 then TOL is not referenced. +*> If JOB > 0 then TOL is not referenced. *> *> On exit, TOL is changed as described above, only if TOL is *> non-positive on entry. Otherwise TOL is unchanged. @@ -136,14 +136,14 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit -*> .lt. 0: if INFO = -i, the i-th argument had an illegal value -*> .gt. 0: overflow would occur when computing the INFO(th) -*> element of the solution vector x. This can only occur -*> when JOB is supplied as positive and either means -*> that a diagonal element of U is very small, or that -*> the elements of the right-hand side vector y are very -*> large. +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: overflow would occur when computing the INFO(th) +*> element of the solution vector x. This can only occur +*> when JOB is supplied as positive and either means +*> that a diagonal element of U is very small, or that +*> the elements of the right-hand side vector y are very +*> large. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slahqr.f b/lapack-netlib/SRC/slahqr.f index d91826e61..e5642d2bf 100644 --- a/lapack-netlib/SRC/slahqr.f +++ b/lapack-netlib/SRC/slahqr.f @@ -150,26 +150,26 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: If INFO = i, SLAHQR failed to compute all the +*> = 0: successful exit +*> > 0: If INFO = i, SLAHQR failed to compute all the *> eigenvalues ILO to IHI in a total of 30 iterations *> per eigenvalue; elements i+1:ihi of WR and WI *> contain those eigenvalues which have been *> successfully computed. *> -*> If INFO .GT. 0 and WANTT is .FALSE., then on exit, +*> If INFO > 0 and WANTT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the *> eigenvalues of the upper Hessenberg matrix rows -*> and columns ILO thorugh INFO of the final, output +*> and columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> (*) (initial value of H)*U = U*(final value of H) -*> where U is an orthognal matrix. The final +*> where U is an orthogonal matrix. The final *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> (final value of Z) = (initial value of Z)*U *> where U is the orthogonal matrix in (*) *> (regardless of the value of WANTT.) diff --git a/lapack-netlib/SRC/slaln2.f b/lapack-netlib/SRC/slaln2.f index f9ceee7b7..4c6a55ec7 100644 --- a/lapack-netlib/SRC/slaln2.f +++ b/lapack-netlib/SRC/slaln2.f @@ -49,7 +49,7 @@ *> the first column of each being the real part and the second *> being the imaginary part. *> -*> "s" is a scaling factor (.LE. 1), computed by SLALN2, which is +*> "s" is a scaling factor (<= 1), computed by SLALN2, which is *> so chosen that X can be computed without overflow. X is further *> scaled if necessary to assure that norm(ca A - w D)*norm(X) is less *> than overflow. 
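The slangb, slange, slanhs, slansb, slansp, slansy, slantb, slantp and slantr hunks that follow all make the same change to the Frobenius-norm branch: instead of threading a single running (SCALE, SUM) pair through every column, each column is accumulated into a local COLSSQ pair by SLASSQ and then folded into the global SSQ pair by the new SCOMBSSQ helper, which improves accuracy for matrices whose columns differ widely in magnitude. The SCOMBSSQ source itself is not shown here, but its contract is implied by the call sites: each pair ( V(1), V(2) ) represents the value V(1)**2 * V(2), so the combine must rescale the pair with the smaller scale onto the larger scale before adding the sums. A minimal sketch of such a combine, under those assumptions (the name COMBSSQ and this exact body are illustrative, not the committed implementation):

*     Illustrative combine of two scaled sums of squares.
*     V1 and V2 each hold ( scale, sumsq ) and represent the
*     value scale**2 * sumsq; on exit V1 represents the sum of
*     the values of V1 and V2, and V2 is unchanged.
      SUBROUTINE COMBSSQ( V1, V2 )
      REAL               V1( 2 ), V2( 2 )
      REAL               ZERO
      PARAMETER          ( ZERO = 0.0E+0 )
      IF( V1( 1 ).GE.V2( 1 ) ) THEN
         IF( V1( 1 ).NE.ZERO ) THEN
*           V1 has the larger scale: rescale V2's sum onto it.
            V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 )
         END IF
      ELSE
*        V2 has the larger scale: rescale V1's sum onto it.
         V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 )
         V1( 1 ) = V2( 1 )
      END IF
      RETURN
      END

With this convention the per-column pairs are initialized to ( ZERO, ONE ), which represents zero, and the final norm is recovered exactly as the hunks do it: VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ).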
diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f index b13d02b6c..59ab1a6ee 100644 --- a/lapack-netlib/SRC/slamswlq.f +++ b/lapack-netlib/SRC/slamswlq.f @@ -1,3 +1,4 @@ +*> \brief \b SLAMSWLQ * * Definition: * =========== diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f index 84ac86ee2..58905ab46 100644 --- a/lapack-netlib/SRC/slamtsqr.f +++ b/lapack-netlib/SRC/slamtsqr.f @@ -1,3 +1,4 @@ +*> \brief \b SLAMTSQR * * Definition: * =========== diff --git a/lapack-netlib/SRC/slangb.f b/lapack-netlib/SRC/slangb.f index fd538b1b7..706e07501 100644 --- a/lapack-netlib/SRC/slangb.f +++ b/lapack-netlib/SRC/slangb.f @@ -129,6 +129,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER KL, KU, LDAB, N @@ -139,22 +140,24 @@ * * ===================================================================== * -* * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER I, J, K, L - REAL SCALE, SUM, VALUE, TEMP + REAL SUM, VALUE, TEMP * .. -* .. External Subroutines .. - EXTERNAL SLASSQ +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. +* .. External Subroutines .. + EXTERNAL SLASSQ, SCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT * .. @@ -206,15 +209,22 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N L = MAX( 1, J-KU ) K = KU + 1 - J + L - CALL SLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANGB = VALUE diff --git a/lapack-netlib/SRC/slange.f b/lapack-netlib/SRC/slange.f index 2eb8d7d14..0c80f1d40 100644 --- a/lapack-netlib/SRC/slange.f +++ b/lapack-netlib/SRC/slange.f @@ -119,6 +119,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER LDA, M, N @@ -135,10 +136,13 @@ * .. * .. Local Scalars .. INTEGER I, J - REAL SCALE, SUM, VALUE, TEMP + REAL SUM, VALUE, TEMP +* .. +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Subroutines .. - EXTERNAL SLASSQ + EXTERNAL SLASSQ, SCOMBSSQ * .. * .. External Functions .. LOGICAL LSAME, SISNAN @@ -194,13 +198,19 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N - CALL SLASSQ( M, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( M, A( 1, J ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANGE = VALUE diff --git a/lapack-netlib/SRC/slanhs.f b/lapack-netlib/SRC/slanhs.f index c5a077fbf..8913031a2 100644 --- a/lapack-netlib/SRC/slanhs.f +++ b/lapack-netlib/SRC/slanhs.f @@ -113,6 +113,7 @@ * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER LDA, N @@ -129,15 +130,18 @@ * .. * .. Local Scalars .. INTEGER I, J - REAL SCALE, SUM, VALUE + REAL SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL SLASSQ +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. +* .. External Subroutines .. + EXTERNAL SLASSQ, SCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT * .. @@ -188,13 +192,20 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N - CALL SLASSQ( MIN( N, J+1 ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( MIN( N, J+1 ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANHS = VALUE diff --git a/lapack-netlib/SRC/slansb.f b/lapack-netlib/SRC/slansb.f index 8f3fe1eb9..23519025d 100644 --- a/lapack-netlib/SRC/slansb.f +++ b/lapack-netlib/SRC/slansb.f @@ -134,6 +134,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER K, LDAB, N @@ -150,15 +151,18 @@ * .. * .. Local Scalars .. INTEGER I, J, L - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL SLASSQ +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. +* .. External Subroutines .. + EXTERNAL SLASSQ, SCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT * .. @@ -225,29 +229,47 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( K.GT.0 ) THEN IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL SLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE L = K + 1 ELSE DO 120 J = 1, N - 1 - CALL SLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE L = 1 END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) ELSE L = 1 END IF - CALL SLASSQ( N, AB( L, 1 ), LDAB, SCALE, SUM ) - VALUE = SCALE*SQRT( SUM ) +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( N, AB( L, 1 ), LDAB, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANSB = VALUE diff --git a/lapack-netlib/SRC/slansp.f b/lapack-netlib/SRC/slansp.f index 35390cd1c..7e29d778b 100644 --- a/lapack-netlib/SRC/slansp.f +++ b/lapack-netlib/SRC/slansp.f @@ -119,6 +119,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER N @@ -135,15 +136,18 @@ * .. * .. Local Scalars .. 
INTEGER I, J, K - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL SLASSQ +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. +* .. External Subroutines .. + EXTERNAL SLASSQ, SCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT * .. @@ -217,31 +221,48 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE K = 2 IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL SLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + J 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL SLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 120 CONTINUE END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* K = 1 + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE DO 130 I = 1, N IF( AP( K ).NE.ZERO ) THEN ABSA = ABS( AP( K ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF IF( LSAME( UPLO, 'U' ) ) THEN @@ -250,7 +271,8 @@ K = K + N - I + 1 END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + CALL SCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANSP = VALUE diff --git a/lapack-netlib/SRC/slansy.f b/lapack-netlib/SRC/slansy.f index c8400e530..66ff1c5c7 100644 --- a/lapack-netlib/SRC/slansy.f +++ b/lapack-netlib/SRC/slansy.f @@ -127,6 +127,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER LDA, N @@ -143,15 +144,18 @@ * .. * .. Local Scalars .. INTEGER I, J - REAL ABSA, SCALE, SUM, VALUE + REAL ABSA, SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL SLASSQ +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. +* .. External Subroutines .. + EXTERNAL SLASSQ, SCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT * .. @@ -216,21 +220,39 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
+* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL SLASSQ( J-1, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( J-1, A( 1, J ), 1, COLSSQ(1), COLSSQ(2) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL SLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( N-J, A( J+1, J ), 1, COLSSQ(1), COLSSQ(2) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE END IF - SUM = 2*SUM - CALL SLASSQ( N, A, LDA+1, SCALE, SUM ) - VALUE = SCALE*SQRT( SUM ) + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( N, A, LDA+1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANSY = VALUE diff --git a/lapack-netlib/SRC/slantb.f b/lapack-netlib/SRC/slantb.f index 3588779cb..5b94618e1 100644 --- a/lapack-netlib/SRC/slantb.f +++ b/lapack-netlib/SRC/slantb.f @@ -145,6 +145,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER K, LDAB, N @@ -162,15 +163,18 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J, L - REAL SCALE, SUM, VALUE + REAL SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL SLASSQ +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. +* .. External Subroutines .. + EXTERNAL SLASSQ, SCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT * .. @@ -311,46 +315,61 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N IF( K.GT.0 ) THEN DO 280 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL SLASSQ( MIN( J-1, K ), - $ AB( MAX( K+2-J, 1 ), J ), 1, SCALE, - $ SUM ) + $ AB( MAX( K+2-J, 1 ), J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 280 CONTINUE END IF ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 290 J = 1, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL SLASSQ( MIN( J, K+1 ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 290 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N IF( K.GT.0 ) THEN DO 300 J = 1, N - 1 - CALL SLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 300 CONTINUE END IF ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 310 J = 1, N - CALL SLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 310 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANTB = VALUE diff --git a/lapack-netlib/SRC/slantp.f b/lapack-netlib/SRC/slantp.f index 1423f5ca3..ab781deac 100644 --- a/lapack-netlib/SRC/slantp.f +++ b/lapack-netlib/SRC/slantp.f @@ -129,6 +129,7 @@ * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER N @@ -146,15 +147,18 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J, K - REAL SCALE, SUM, VALUE + REAL SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL SLASSQ +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. +* .. External Subroutines .. + EXTERNAL SLASSQ, SCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT * .. @@ -306,45 +310,64 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N K = 2 DO 280 J = 2, N - CALL SLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( J-1, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + J 280 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE K = 1 DO 290 J = 1, N - CALL SLASSQ( J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( J, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + J 290 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N K = 2 DO 300 J = 1, N - 1 - CALL SLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( N-J, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 300 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE K = 1 DO 310 J = 1, N - CALL SLASSQ( N-J+1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( N-J+1, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 310 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANTP = VALUE diff --git a/lapack-netlib/SRC/slantr.f b/lapack-netlib/SRC/slantr.f index 63b855892..04d29f537 100644 --- a/lapack-netlib/SRC/slantr.f +++ b/lapack-netlib/SRC/slantr.f @@ -146,6 +146,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER LDA, M, N @@ -163,15 +164,18 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J - REAL SCALE, SUM, VALUE + REAL SUM, VALUE * .. -* .. External Subroutines .. - EXTERNAL SLASSQ +* .. Local Arrays .. + REAL SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, SISNAN EXTERNAL LSAME, SISNAN * .. +* .. External Subroutines .. + EXTERNAL SLASSQ, SCOMBSSQ +* .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT * .. @@ -281,7 +285,7 @@ END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - DO 210 I = 1, N + DO 210 I = 1, MIN( M, N ) WORK( I ) = ONE 210 CONTINUE DO 220 I = N + 1, M @@ -311,38 +315,56 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
* IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = MIN( M, N ) + SSQ( 1 ) = ONE + SSQ( 2 ) = MIN( M, N ) DO 290 J = 2, N - CALL SLASSQ( MIN( M, J-1 ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( MIN( M, J-1 ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 290 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 300 J = 1, N - CALL SLASSQ( MIN( M, J ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( MIN( M, J ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 300 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = MIN( M, N ) + SSQ( 1 ) = ONE + SSQ( 2 ) = MIN( M, N ) DO 310 J = 1, N - CALL SLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 310 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 320 J = 1, N - CALL SLASSQ( M-J+1, A( J, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL SLASSQ( M-J+1, A( J, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL SCOMBSSQ( SSQ, COLSSQ ) 320 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * SLANTR = VALUE diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index e73e5455c..1163446fa 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -161,7 +161,6 @@ IF( C.EQ.ZERO ) THEN CS = ONE SN = ZERO - GO TO 10 * ELSE IF( B.EQ.ZERO ) THEN * @@ -174,12 +173,12 @@ A = TEMP B = -C C = ZERO - GO TO 10 +* ELSE IF( (A-D).EQ.ZERO .AND. SIGN( ONE, B ).NE. $ SIGN( ONE, C ) ) THEN CS = ONE SN = ZERO - GO TO 10 +* ELSE * TEMP = A - D @@ -207,6 +206,7 @@ SN = C / TAU B = B - C C = ZERO +* ELSE * * Complex eigenvalues, or real (almost) equal eigenvalues. @@ -268,8 +268,6 @@ END IF * END IF -* - 10 CONTINUE * * Store eigenvalues in (RT1R,RT1I) and (RT2R,RT2I). * diff --git a/lapack-netlib/SRC/slaorhr_col_getrfnp.f b/lapack-netlib/SRC/slaorhr_col_getrfnp.f new file mode 100644 index 000000000..6cc59e538 --- /dev/null +++ b/lapack-netlib/SRC/slaorhr_col_getrfnp.f @@ -0,0 +1,248 @@ +*> \brief \b SLAORHR_COL_GETRFNP +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLAORHR_COL_GETRFNP + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLAORHR_COL_GETRFNP( M, N, A, LDA, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), D( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLAORHR_COL_GETRFNP computes the modified LU factorization without +*> pivoting of a real general M-by-N matrix A. The factorization has +*> the form: +*> +*> A - S = L * U, +*> +*> where: +*> S is a m-by-n diagonal sign matrix with the diagonal D, so that +*> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed +*> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing +*> i-1 steps of Gaussian elimination. 
This means that the diagonal +*> element at each step of "modified" Gaussian elimination is +*> at least one in absolute value (so that division-by-zero not +*> not possible during the division by the diagonal element); +*> +*> L is a M-by-N lower triangular matrix with unit diagonal elements +*> (lower trapezoidal if M > N); +*> +*> and U is a M-by-N upper triangular matrix +*> (upper trapezoidal if M < N). +*> +*> This routine is an auxiliary routine used in the Householder +*> reconstruction routine SORHR_COL. In SORHR_COL, this routine is +*> applied to an M-by-N matrix A with orthonormal columns, where each +*> element is bounded by one in absolute value. With the choice of +*> the matrix S above, one can show that the diagonal element at each +*> step of Gaussian elimination is the largest (in absolute value) in +*> the column on or below the diagonal, so that no pivoting is required +*> for numerical stability [1]. +*> +*> For more details on the Householder reconstruction algorithm, +*> including the modified LU factorization, see [1]. +*> +*> This is the blocked right-looking version of the algorithm, +*> calling Level 3 BLAS to update the submatrix. To factorize a block, +*> this routine calls the recursive routine SLAORHR_COL_GETRFNP2. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the M-by-N matrix to be factored. +*> On exit, the factors L and U from the factorization +*> A-S=L*U; the unit diagonal elements of L are not stored. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is REAL array, dimension min(M,N) +*> The diagonal elements of the diagonal M-by-N sign matrix S, +*> D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can +*> be only plus or minus one. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup realGEcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SLAORHR_COL_GETRFNP( M, N, A, LDA, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ), D( * ) +* .. 
+* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLAORHR_COL_GETRFNP2, STRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLAORHR_COL_GETRFNP', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + + NB = ILAENV( 1, 'SLAORHR_COL_GETRFNP', ' ', M, N, -1, -1 ) + + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL SLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) + ELSE +* +* Use blocked code. +* + DO J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks. +* + CALL SLAORHR_COL_GETRFNP2( M-J+1, JB, A( J, J ), LDA, + $ D( J ), IINFO ) +* + IF( J+JB.LE.N ) THEN +* +* Compute block row of U. +* + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL SGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + END DO + END IF + RETURN +* +* End of SLAORHR_COL_GETRFNP +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/slaorhr_col_getrfnp2.f b/lapack-netlib/SRC/slaorhr_col_getrfnp2.f new file mode 100644 index 000000000..de604602f --- /dev/null +++ b/lapack-netlib/SRC/slaorhr_col_getrfnp2.f @@ -0,0 +1,305 @@ +*> \brief \b SLAORHR_COL_GETRFNP2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAORHR_GETRF2NP + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE SLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), D( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLAORHR_COL_GETRFNP2 computes the modified LU factorization without +*> pivoting of a real general M-by-N matrix A. The factorization has +*> the form: +*> +*> A - S = L * U, +*> +*> where: +*> S is a m-by-n diagonal sign matrix with the diagonal D, so that +*> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed +*> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing +*> i-1 steps of Gaussian elimination. This means that the diagonal +*> element at each step of "modified" Gaussian elimination is at +*> least one in absolute value (so that division-by-zero not +*> possible during the division by the diagonal element); +*> +*> L is a M-by-N lower triangular matrix with unit diagonal elements +*> (lower trapezoidal if M > N); +*> +*> and U is a M-by-N upper triangular matrix +*> (upper trapezoidal if M < N). +*> +*> This routine is an auxiliary routine used in the Householder +*> reconstruction routine SORHR_COL. 
In SORHR_COL, this routine is +*> applied to an M-by-N matrix A with orthonormal columns, where each +*> element is bounded by one in absolute value. With the choice of +*> the matrix S above, one can show that the diagonal element at each +*> step of Gaussian elimination is the largest (in absolute value) in +*> the column on or below the diagonal, so that no pivoting is required +*> for numerical stability [1]. +*> +*> For more details on the Householder reconstruction algorithm, +*> including the modified LU factorization, see [1]. +*> +*> This is the recursive version of the LU factorization algorithm. +*> Denote A - S by B. The algorithm divides the matrix B into four +*> submatrices: +*> +*> [ B11 | B12 ] where B11 is n1 by n1, +*> B = [ -----|----- ] B21 is (m-n1) by n1, +*> [ B21 | B22 ] B12 is n1 by n2, +*> B22 is (m-n1) by n2, +*> with n1 = min(m,n)/2, n2 = n-n1. +*> +*> +*> The subroutine calls itself to factor B11, solves for B21, +*> solves for B12, updates B22, then calls itself to factor B22. +*> +*> For more details on the recursive LU algorithm, see [2]. +*> +*> SLAORHR_COL_GETRFNP2 is called to factorize a block by the blocked +*> routine SLAORHR_COL_GETRFNP, which uses blocked code calling +*. Level 3 BLAS to update the submatrix. However, SLAORHR_COL_GETRFNP2 +*> is self-sufficient and can be used without SLAORHR_COL_GETRFNP. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> +*> [2] "Recursion leads to automatic variable blocking for dense linear +*> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev., +*> vol. 41, no. 6, pp. 737-755, 1997. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the M-by-N matrix to be factored. +*> On exit, the factors L and U from the factorization +*> A-S=L*U; the unit diagonal elements of L are not stored. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is REAL array, dimension min(M,N) +*> The diagonal elements of the diagonal M-by-N sign matrix S, +*> D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can +*> be only plus or minus one. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup realGEcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + RECURSIVE SUBROUTINE SLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. 
of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ), D( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + REAL SFMIN + INTEGER I, IINFO, N1, N2 +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SSCAL, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, SIGN, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLAORHR_COL_GETRFNP2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) + $ RETURN + + IF ( M.EQ.1 ) THEN +* +* One row case, (also recursion termination case), +* use unblocked code +* +* Transfer the sign +* + D( 1 ) = -SIGN( ONE, A( 1, 1 ) ) +* +* Construct the row of U +* + A( 1, 1 ) = A( 1, 1 ) - D( 1 ) +* + ELSE IF( N.EQ.1 ) THEN +* +* One column case, (also recursion termination case), +* use unblocked code +* +* Transfer the sign +* + D( 1 ) = -SIGN( ONE, A( 1, 1 ) ) +* +* Construct the row of U +* + A( 1, 1 ) = A( 1, 1 ) - D( 1 ) +* +* Scale the elements 2:M of the column +* +* Determine machine safe minimum +* + SFMIN = SLAMCH('S') +* +* Construct the subdiagonal elements of L +* + IF( ABS( A( 1, 1 ) ) .GE. SFMIN ) THEN + CALL SSCAL( M-1, ONE / A( 1, 1 ), A( 2, 1 ), 1 ) + ELSE + DO I = 2, M + A( I, 1 ) = A( I, 1 ) / A( 1, 1 ) + END DO + END IF +* + ELSE +* +* Divide the matrix B into four submatrices +* + N1 = MIN( M, N ) / 2 + N2 = N-N1 + +* +* Factor B11, recursive call +* + CALL SLAORHR_COL_GETRFNP2( N1, N1, A, LDA, D, IINFO ) +* +* Solve for B21 +* + CALL STRSM( 'R', 'U', 'N', 'N', M-N1, N1, ONE, A, LDA, + $ A( N1+1, 1 ), LDA ) +* +* Solve for B12 +* + CALL STRSM( 'L', 'L', 'N', 'U', N1, N2, ONE, A, LDA, + $ A( 1, N1+1 ), LDA ) +* +* Update B22, i.e. compute the Schur complement +* B22 := B22 - B21*B12 +* + CALL SGEMM( 'N', 'N', M-N1, N2, N1, -ONE, A( N1+1, 1 ), LDA, + $ A( 1, N1+1 ), LDA, ONE, A( N1+1, N1+1 ), LDA ) +* +* Factor B22, recursive call +* + CALL SLAORHR_COL_GETRFNP2( M-N1, N2, A( N1+1, N1+1 ), LDA, + $ D( N1+1 ), IINFO ) +* + END IF + RETURN +* +* End of SLAORHR_COL_GETRFNP2 +* + END diff --git a/lapack-netlib/SRC/slaqps.f b/lapack-netlib/SRC/slaqps.f index 9c62ec8b6..3f8af304f 100644 --- a/lapack-netlib/SRC/slaqps.f +++ b/lapack-netlib/SRC/slaqps.f @@ -127,7 +127,7 @@ *> \param[in,out] AUXV *> \verbatim *> AUXV is REAL array, dimension (NB) -*> Auxiliar vector. +*> Auxiliary vector. *> \endverbatim *> *> \param[in,out] F diff --git a/lapack-netlib/SRC/slaqr0.f b/lapack-netlib/SRC/slaqr0.f index 1dcd3d176..318b46943 100644 --- a/lapack-netlib/SRC/slaqr0.f +++ b/lapack-netlib/SRC/slaqr0.f @@ -67,7 +67,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -79,12 +79,12 @@ *> \verbatim *> IHI is INTEGER *> It is assumed that H is already upper triangular in rows -*> and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1, +*> and columns 1:ILO-1 and IHI+1:N and, if ILO > 1, *> H(ILO,ILO-1) is zero. 
ILO and IHI are normally set by a *> previous call to SGEBAL, and then passed to SGEHRD when the *> matrix output by SGEBAL is reduced to Hessenberg form. *> Otherwise, ILO and IHI should be set to 1 and N, -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -97,19 +97,19 @@ *> decomposition (the Schur form); 2-by-2 diagonal blocks *> (corresponding to complex conjugate pairs of eigenvalues) *> are returned in standard form, with H(i,i) = H(i+1,i+1) -*> and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is +*> and H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and WANTT is *> .FALSE., then the contents of H are unspecified on exit. -*> (The output value of H when INFO.GT.0 is given under the +*> (The output value of H when INFO > 0 is given under the *> description of INFO below.) *> -*> This subroutine may explicitly set H(i,j) = 0 for i.GT.j and +*> This subroutine may explicitly set H(i,j) = 0 for i > j and *> j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] WR @@ -125,7 +125,7 @@ *> and WI(ILO:IHI). If two eigenvalues are computed as a *> complex conjugate pair, they are stored in consecutive *> elements of WR and WI, say the i-th and (i+1)th, with -*> WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then +*> WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., then *> the eigenvalues are stored in the same order as on the *> diagonal of the Schur form returned in H, with *> WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal @@ -143,7 +143,7 @@ *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. -*> 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N. +*> 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -153,7 +153,7 @@ *> If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is *> replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the *> orthogonal Schur factor of H(ILO:IHI,ILO:IHI). -*> (The output value of Z when INFO.GT.0 is given under +*> (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -161,7 +161,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if WANTZ is .TRUE. -*> then LDZ.GE.MAX(1,IHIZ). Otherwize, LDZ.GE.1. +*> then LDZ >= MAX(1,IHIZ). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -174,7 +174,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient, but LWORK typically as large as 6*N may *> be required for optimal performance. A workspace query *> to determine the optimal workspace size is recommended. @@ -190,19 +190,19 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, SLAQR0 failed to compute all of +*> = 0: successful exit +*> > 0: if INFO = i, SLAQR0 failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 
0 and WANT is .FALSE., then on exit, +*> If INFO > 0 and WANT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -210,7 +210,7 @@ *> value of H is upper Hessenberg and quasi-triangular *> in rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> *> (final value of Z(ILO:IHI,ILOZ:IHIZ) *> = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U @@ -218,7 +218,7 @@ *> where U is the orthogonal matrix in (*) (regard- *> less of the value of WANTT.) *> -*> If INFO .GT. 0 and WANTZ is .FALSE., then Z is not +*> If INFO > 0 and WANTZ is .FALSE., then Z is not *> accessed. *> \endverbatim * @@ -677,7 +677,7 @@ END IF END IF * -* ==== Use up to NS of the the smallest magnatiude +* ==== Use up to NS of the the smallest magnitude * . shifts. If there aren't NS shifts available, * . then use them all, possibly dropping one to * . make the number of shifts even. ==== diff --git a/lapack-netlib/SRC/slaqr1.f b/lapack-netlib/SRC/slaqr1.f index 2de33849d..6bb88c794 100644 --- a/lapack-netlib/SRC/slaqr1.f +++ b/lapack-netlib/SRC/slaqr1.f @@ -69,7 +69,7 @@ *> \verbatim *> LDH is INTEGER *> The leading dimension of H as declared in -*> the calling procedure. LDH.GE.N +*> the calling procedure. LDH >= N *> \endverbatim *> *> \param[in] SR1 diff --git a/lapack-netlib/SRC/slaqr2.f b/lapack-netlib/SRC/slaqr2.f index 8e1f34910..f4f8ca7f2 100644 --- a/lapack-netlib/SRC/slaqr2.f +++ b/lapack-netlib/SRC/slaqr2.f @@ -103,7 +103,7 @@ *> \param[in] NW *> \verbatim *> NW is INTEGER -*> Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). +*> Deflation window size. 1 <= NW <= (KBOT-KTOP+1). *> \endverbatim *> *> \param[in,out] H @@ -121,7 +121,7 @@ *> \verbatim *> LDH is INTEGER *> Leading dimension of H just as declared in the calling -*> subroutine. N .LE. LDH +*> subroutine. N <= LDH *> \endverbatim *> *> \param[in] ILOZ @@ -133,7 +133,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -149,7 +149,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of Z just as declared in the -*> calling subroutine. 1 .LE. LDZ. +*> calling subroutine. 1 <= LDZ. *> \endverbatim *> *> \param[out] NS @@ -194,13 +194,13 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of V just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is INTEGER -*> The number of columns of T. NH.GE.NW. +*> The number of columns of T. NH >= NW. *> \endverbatim *> *> \param[out] T @@ -212,14 +212,14 @@ *> \verbatim *> LDT is INTEGER *> The leading dimension of T just as declared in the -*> calling subroutine. NW .LE. LDT +*> calling subroutine. NW <= LDT *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> The number of rows of work array WV available for -*> workspace. NV.GE.NW. +*> workspace. NV >= NW. 
*> \endverbatim *> *> \param[out] WV @@ -231,7 +231,7 @@ *> \verbatim *> LDWV is INTEGER *> The leading dimension of W just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/slaqr3.f b/lapack-netlib/SRC/slaqr3.f index 534e2c489..ccad338b9 100644 --- a/lapack-netlib/SRC/slaqr3.f +++ b/lapack-netlib/SRC/slaqr3.f @@ -100,7 +100,7 @@ *> \param[in] NW *> \verbatim *> NW is INTEGER -*> Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). +*> Deflation window size. 1 <= NW <= (KBOT-KTOP+1). *> \endverbatim *> *> \param[in,out] H @@ -118,7 +118,7 @@ *> \verbatim *> LDH is INTEGER *> Leading dimension of H just as declared in the calling -*> subroutine. N .LE. LDH +*> subroutine. N <= LDH *> \endverbatim *> *> \param[in] ILOZ @@ -130,7 +130,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -146,7 +146,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of Z just as declared in the -*> calling subroutine. 1 .LE. LDZ. +*> calling subroutine. 1 <= LDZ. *> \endverbatim *> *> \param[out] NS @@ -191,13 +191,13 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of V just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is INTEGER -*> The number of columns of T. NH.GE.NW. +*> The number of columns of T. NH >= NW. *> \endverbatim *> *> \param[out] T @@ -209,14 +209,14 @@ *> \verbatim *> LDT is INTEGER *> The leading dimension of T just as declared in the -*> calling subroutine. NW .LE. LDT +*> calling subroutine. NW <= LDT *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> The number of rows of work array WV available for -*> workspace. NV.GE.NW. +*> workspace. NV >= NW. *> \endverbatim *> *> \param[out] WV @@ -228,7 +228,7 @@ *> \verbatim *> LDWV is INTEGER *> The leading dimension of W just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index 12b6b2fb1..cd642e07f 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -74,7 +74,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -86,12 +86,12 @@ *> \verbatim *> IHI is INTEGER *> It is assumed that H is already upper triangular in rows -*> and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1, +*> and columns 1:ILO-1 and IHI+1:N and, if ILO > 1, *> H(ILO,ILO-1) is zero. ILO and IHI are normally set by a *> previous call to SGEBAL, and then passed to SGEHRD when the *> matrix output by SGEBAL is reduced to Hessenberg form. *> Otherwise, ILO and IHI should be set to 1 and N, -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -104,19 +104,19 @@ *> decomposition (the Schur form); 2-by-2 diagonal blocks *> (corresponding to complex conjugate pairs of eigenvalues) *> are returned in standard form, with H(i,i) = H(i+1,i+1) -*> and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is +*> and H(i+1,i)*H(i,i+1) < 0. 
If INFO = 0 and WANTT is *> .FALSE., then the contents of H are unspecified on exit. -*> (The output value of H when INFO.GT.0 is given under the +*> (The output value of H when INFO > 0 is given under the *> description of INFO below.) *> -*> This subroutine may explicitly set H(i,j) = 0 for i.GT.j and +*> This subroutine may explicitly set H(i,j) = 0 for i > j and *> j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] WR @@ -132,7 +132,7 @@ *> and WI(ILO:IHI). If two eigenvalues are computed as a *> complex conjugate pair, they are stored in consecutive *> elements of WR and WI, say the i-th and (i+1)th, with -*> WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then +*> WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., then *> the eigenvalues are stored in the same order as on the *> diagonal of the Schur form returned in H, with *> WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal @@ -150,7 +150,7 @@ *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. -*> 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N. +*> 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -160,7 +160,7 @@ *> If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is *> replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the *> orthogonal Schur factor of H(ILO:IHI,ILO:IHI). -*> (The output value of Z when INFO.GT.0 is given under +*> (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -168,7 +168,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if WANTZ is .TRUE. -*> then LDZ.GE.MAX(1,IHIZ). Otherwize, LDZ.GE.1. +*> then LDZ >= MAX(1,IHIZ). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -181,7 +181,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient, but LWORK typically as large as 6*N may *> be required for optimal performance. A workspace query *> to determine the optimal workspace size is recommended. @@ -199,19 +199,19 @@ *> INFO is INTEGER *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, SLAQR4 failed to compute all of +*> = 0: successful exit +*> > 0: if INFO = i, SLAQR4 failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and WANT is .FALSE., then on exit, +*> If INFO > 0 and WANT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -219,7 +219,7 @@ *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 
0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> *> (final value of Z(ILO:IHI,ILOZ:IHIZ) *> = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U @@ -227,7 +227,7 @@ *> where U is the orthogonal matrix in (*) (regard- *> less of the value of WANTT.) *> -*> If INFO .GT. 0 and WANTZ is .FALSE., then Z is not +*> If INFO > 0 and WANTZ is .FALSE., then Z is not *> accessed. *> \endverbatim * @@ -680,7 +680,7 @@ END IF END IF * -* ==== Use up to NS of the the smallest magnatiude +* ==== Use up to NS of the the smallest magnitude * . shifts. If there aren't NS shifts available, * . then use them all, possibly dropping one to * . make the number of shifts even. ==== diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index 65278e355..f04ee577e 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -133,7 +133,7 @@ *> \verbatim *> LDH is INTEGER *> LDH is the leading dimension of H just as declared in the -*> calling procedure. LDH.GE.MAX(1,N). +*> calling procedure. LDH >= MAX(1,N). *> \endverbatim *> *> \param[in] ILOZ @@ -145,7 +145,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N *> \endverbatim *> *> \param[in,out] Z @@ -161,7 +161,7 @@ *> \verbatim *> LDZ is INTEGER *> LDA is the leading dimension of Z just as declared in -*> the calling procedure. LDZ.GE.N. +*> the calling procedure. LDZ >= N. *> \endverbatim *> *> \param[out] V @@ -173,7 +173,7 @@ *> \verbatim *> LDV is INTEGER *> LDV is the leading dimension of V as declared in the -*> calling procedure. LDV.GE.3. +*> calling procedure. LDV >= 3. *> \endverbatim *> *> \param[out] U @@ -185,33 +185,14 @@ *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU.GE.3*NSHFTS-3. -*> \endverbatim -*> -*> \param[in] NH -*> \verbatim -*> NH is INTEGER -*> NH is the number of columns in array WH available for -*> workspace. NH.GE.1. -*> \endverbatim -*> -*> \param[out] WH -*> \verbatim -*> WH is REAL array, dimension (LDWH,NH) -*> \endverbatim -*> -*> \param[in] LDWH -*> \verbatim -*> LDWH is INTEGER -*> Leading dimension of WH just as declared in the -*> calling procedure. LDWH.GE.3*NSHFTS-3. +*> in the calling subroutine. LDU >= 3*NSHFTS-3. *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> NV is the number of rows in WV agailable for workspace. -*> NV.GE.1. +*> NV >= 1. *> \endverbatim *> *> \param[out] WV @@ -223,9 +204,28 @@ *> \verbatim *> LDWV is INTEGER *> LDWV is the leading dimension of WV as declared in the -*> in the calling subroutine. LDWV.GE.NV. +*> in the calling subroutine. LDWV >= NV. *> \endverbatim * +*> \param[in] NH +*> \verbatim +*> NH is INTEGER +*> NH is the number of columns in array WH available for +*> workspace. NH >= 1. +*> \endverbatim +*> +*> \param[out] WH +*> \verbatim +*> WH is REAL array, dimension (LDWH,NH) +*> \endverbatim +*> +*> \param[in] LDWH +*> \verbatim +*> LDWH is INTEGER +*> Leading dimension of WH just as declared in the +*> calling procedure. LDWH >= 3*NSHFTS-3. 
+*> \endverbatim
+*>
* Authors:
* ========
*
diff --git a/lapack-netlib/SRC/slarfb.f b/lapack-netlib/SRC/slarfb.f
index c51f69534..d853a54ec 100644
--- a/lapack-netlib/SRC/slarfb.f
+++ b/lapack-netlib/SRC/slarfb.f
@@ -92,6 +92,8 @@
*> K is INTEGER
*> The order of the matrix T (= the number of elementary
*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
*> \endverbatim
*>
*> \param[in] V
diff --git a/lapack-netlib/SRC/slarfx.f b/lapack-netlib/SRC/slarfx.f
index 590e99e70..3175068b8 100644
--- a/lapack-netlib/SRC/slarfx.f
+++ b/lapack-netlib/SRC/slarfx.f
@@ -94,7 +94,7 @@
*> \param[in] LDC
*> \verbatim
*> LDC is INTEGER
-*> The leading dimension of the array C. LDA >= (1,M).
+*> The leading dimension of the array C. LDC >= max(1,M).
*> \endverbatim
*>
*> \param[out] WORK
diff --git a/lapack-netlib/SRC/slarfy.f b/lapack-netlib/SRC/slarfy.f
index 340c54413..f9ba011a2 100644
--- a/lapack-netlib/SRC/slarfy.f
+++ b/lapack-netlib/SRC/slarfy.f
@@ -103,7 +103,7 @@
*
*> \date December 2016
*
-*> \ingroup single_eig
+*> \ingroup realOTHERauxiliary
*
* =====================================================================
SUBROUTINE SLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
diff --git a/lapack-netlib/SRC/slarrb.f b/lapack-netlib/SRC/slarrb.f
index 988e25ff0..ac9d7bc8c 100644
--- a/lapack-netlib/SRC/slarrb.f
+++ b/lapack-netlib/SRC/slarrb.f
@@ -91,7 +91,7 @@
*> RTOL2 is REAL
*> Tolerance for the convergence of the bisection intervals.
*> An interval [LEFT,RIGHT] has converged if
-*> RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*> RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
*> where GAP is the (estimated) distance to the nearest
*> eigenvalue.
*> \endverbatim
@@ -117,7 +117,7 @@
*> WGAP is REAL array, dimension (N-1)
*> On input, the (estimated) gaps between consecutive
*> eigenvalues of L D L^T, i.e., WGAP(I-OFFSET) is the gap between
-*> eigenvalues I and I+1. Note that if IFIRST.EQ.ILAST
+*> eigenvalues I and I+1. Note that if IFIRST = ILAST
*> then WGAP(IFIRST-OFFSET) must be set to ZERO.
*> On output, these gaps are refined.
*> \endverbatim
diff --git a/lapack-netlib/SRC/slarre.f b/lapack-netlib/SRC/slarre.f
index ea9b8fcbc..6636235d0 100644
--- a/lapack-netlib/SRC/slarre.f
+++ b/lapack-netlib/SRC/slarre.f
@@ -150,7 +150,7 @@
*> RTOL2 is REAL
*> Parameters for bisection.
*> An interval [LEFT,RIGHT] has converged if
-*> RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*> RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
*> \endverbatim
*>
*> \param[in] SPLTOL
diff --git a/lapack-netlib/SRC/slarrj.f b/lapack-netlib/SRC/slarrj.f
index a721d0751..fb867595c 100644
--- a/lapack-netlib/SRC/slarrj.f
+++ b/lapack-netlib/SRC/slarrj.f
@@ -85,7 +85,7 @@
*> RTOL is REAL
*> Tolerance for the convergence of the bisection intervals.
*> An interval [LEFT,RIGHT] has converged if
-*> RIGHT-LEFT.LT.RTOL*MAX(|LEFT|,|RIGHT|).
+*> RIGHT-LEFT < RTOL*MAX(|LEFT|,|RIGHT|).
*> \endverbatim
*>
*> \param[in] OFFSET
diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f
index f9e3cf2b9..04519fde8 100644
--- a/lapack-netlib/SRC/slarrv.f
+++ b/lapack-netlib/SRC/slarrv.f
@@ -149,7 +149,7 @@
*> RTOL2 is REAL
*> Parameters for bisection.
*> An interval [LEFT,RIGHT] has converged if
-*> RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*> RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
*> \endverbatim
*>
*> \param[in,out] W
diff --git a/lapack-netlib/SRC/slasd7.f b/lapack-netlib/SRC/slasd7.f
index 2adaa5ee7..d8775b6c6 100644
--- a/lapack-netlib/SRC/slasd7.f
+++ b/lapack-netlib/SRC/slasd7.f
@@ -400,7 +400,7 @@ VL( I ) = VLW( IDXI )
50 CONTINUE
*
-* Calculate the allowable deflation tolerence
+* Calculate the allowable deflation tolerance
*
EPS = SLAMCH( 'Epsilon' )
TOL = MAX( ABS( ALPHA ), ABS( BETA ) )
diff --git a/lapack-netlib/SRC/slassq.f b/lapack-netlib/SRC/slassq.f
index 35b40f07f..d9930a597 100644
--- a/lapack-netlib/SRC/slassq.f
+++ b/lapack-netlib/SRC/slassq.f
@@ -60,7 +60,7 @@
*>
*> \param[in] X
*> \verbatim
-*> X is REAL array, dimension (N)
+*> X is REAL array, dimension (1+(N-1)*INCX)
*> The vector for which a scaled sum of squares is computed.
*> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
*> \endverbatim
diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f
index 27b5b8067..5eb9cc5f9 100644
--- a/lapack-netlib/SRC/slaswlq.f
+++ b/lapack-netlib/SRC/slaswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b SLASWLQ
*
* Definition:
* ===========
@@ -18,9 +19,20 @@
*>
*> \verbatim
*>
-*> SLASWLQ computes a blocked Short-Wide LQ factorization of a
-*> M-by-N matrix A, where N >= M:
-*> A = L * Q
+*> SLASWLQ computes a blocked Short-Wide LQ factorization of
+*> a real M-by-N matrix A for M <= N:
+*>
+*> A = ( L 0 ) * Q,
+*>
+*> where:
+*>
+*> Q is an N-by-N orthogonal matrix, stored on exit in an implicit
+*> form in the elements above the diagonal of the array A and in
+*> the elements of the array T;
+*> L is a lower-triangular M-by-M matrix stored on exit in
+*> the elements on and below the diagonal of the array A.
+*> 0 is an M-by-(N-M) zero matrix, if M < N, and is not stored.
+*>
*> \endverbatim
*
* Arguments:
@@ -150,10 +162,10 @@ SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
$ INFO)
*
-* -- LAPACK computational routine (version 3.8.0) --
+* -- LAPACK computational routine (version 3.9.0) --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-* November 2017
+* November 2019
*
* .. Scalar Arguments ..
INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT diff --git a/lapack-netlib/SRC/slasyf_aa.f b/lapack-netlib/SRC/slasyf_aa.f index ed4ef6291..76f632602 100644 --- a/lapack-netlib/SRC/slasyf_aa.f +++ b/lapack-netlib/SRC/slasyf_aa.f @@ -284,8 +284,9 @@ * * Swap A(I1, I2+1:M) with A(I2, I2+1:M) * - CALL SSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, - $ A( J1+I2-1, I2+1 ), LDA ) + IF( I2.LT.M ) + $ CALL SSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, + $ A( J1+I2-1, I2+1 ), LDA ) * * Swap A(I1, I1) with A(I2,I2) * @@ -325,13 +326,15 @@ * Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1) * - IF( A( K, J+1 ).NE.ZERO ) THEN - ALPHA = ONE / A( K, J+1 ) - CALL SCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) - CALL SSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) - ELSE - CALL SLASET( 'Full', 1, M-J-1, ZERO, ZERO, - $ A( K, J+2 ), LDA) + IF( J.LT.(M-1) ) THEN + IF( A( K, J+1 ).NE.ZERO ) THEN + ALPHA = ONE / A( K, J+1 ) + CALL SCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) + CALL SSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) + ELSE + CALL SLASET( 'Full', 1, M-J-1, ZERO, ZERO, + $ A( K, J+2 ), LDA) + END IF END IF END IF J = J + 1 @@ -432,8 +435,9 @@ * * Swap A(I2+1:M, I1) with A(I2+1:M, I2) * - CALL SSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, - $ A( I2+1, J1+I2-1 ), 1 ) + IF( I2.LT.M ) + $ CALL SSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, + $ A( I2+1, J1+I2-1 ), 1 ) * * Swap A(I1, I1) with A(I2, I2) * @@ -473,13 +477,15 @@ * Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1) * - IF( A( J+1, K ).NE.ZERO ) THEN - ALPHA = ONE / A( J+1, K ) - CALL SCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) - CALL SSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) - ELSE - CALL SLASET( 'Full', M-J-1, 1, ZERO, ZERO, - $ A( J+2, K ), LDA ) + IF( J.LT.(M-1) ) THEN + IF( A( J+1, K ).NE.ZERO ) THEN + ALPHA = ONE / A( J+1, K ) + CALL SCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) + CALL SSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) + ELSE + CALL SLASET( 'Full', M-J-1, 1, ZERO, ZERO, + $ A( J+2, K ), LDA ) + END IF END IF END IF J = J + 1 diff --git a/lapack-netlib/SRC/slasyf_rk.f b/lapack-netlib/SRC/slasyf_rk.f index b1b37177f..c16708365 100644 --- a/lapack-netlib/SRC/slasyf_rk.f +++ b/lapack-netlib/SRC/slasyf_rk.f @@ -321,7 +321,7 @@ * of A and working backwards, and compute the matrix W = U12*D * for use in updating A11 * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = ZERO @@ -649,7 +649,7 @@ * of A and working forwards, and compute the matrix W = L21*D * for use in updating A22 * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = ZERO * diff --git a/lapack-netlib/SRC/slatdf.f b/lapack-netlib/SRC/slatdf.f index 5496f9db4..495d32502 100644 --- a/lapack-netlib/SRC/slatdf.f +++ b/lapack-netlib/SRC/slatdf.f @@ -85,7 +85,7 @@ *> RHS is REAL array, dimension N. *> On entry, RHS contains contributions from other subsystems. *> On exit, RHS contains the solution of the subsystem with -*> entries acoording to the value of IJOB (see above). +*> entries according to the value of IJOB (see above). *> \endverbatim *> *> \param[in,out] RDSUM @@ -260,7 +260,7 @@ * * Solve for U-part, look-ahead for RHS(N) = +-1. 
This is not done
* in BSOLVE and will hopefully give us a better estimate because
-* any ill-conditioning of the original matrix is transfered to U
+* any ill-conditioning of the original matrix is transferred to U
* and not to L. U(N, N) is an approximation to sigma_min(LU).
*
CALL SCOPY( N-1, RHS, 1, XP, 1 )
diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f
index d6d682799..b56b0d41e 100644
--- a/lapack-netlib/SRC/slatsqr.f
+++ b/lapack-netlib/SRC/slatsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b SLATSQR
*
* Definition:
* ===========
@@ -19,8 +20,22 @@
*> \verbatim
*>
*> SLATSQR computes a blocked Tall-Skinny QR factorization of
-*> an M-by-N matrix A, where M >= N:
-*> A = Q * R .
+*> a real M-by-N matrix A for M >= N:
+*>
+*> A = Q * ( R ),
+*> ( 0 )
+*>
+*> where:
+*>
+*> Q is an M-by-M orthogonal matrix, stored on exit in an implicit
+*> form in the elements below the diagonal of the array A and in
+*> the elements of the array T;
+*>
+*> R is an upper-triangular N-by-N matrix, stored on exit in
+*> the elements on and above the diagonal of the array A.
+*>
+*> 0 is an (M-N)-by-N zero matrix, and is not stored.
+*>
*> \endverbatim
*
* Arguments:
@@ -149,10 +164,10 @@ SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
$ LWORK, INFO)
*
-* -- LAPACK computational routine (version 3.7.0) --
+* -- LAPACK computational routine (version 3.9.0) --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-* December 2016
+* November 2019
*
* .. Scalar Arguments ..
INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK
diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f
new file mode 100644
index 000000000..748760d63
--- /dev/null
+++ b/lapack-netlib/SRC/sorgtsqr.f
@@ -0,0 +1,306 @@
+*> \brief \b SORGTSQR
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SORGTSQR + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*>
+* Definition:
+* ===========
+*
+* SUBROUTINE SORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+* $ INFO )
+*
+* .. Scalar Arguments ..
+* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB
+* ..
+* .. Array Arguments ..
+* REAL A( LDA, * ), T( LDT, * ), WORK( * )
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SORGTSQR generates an M-by-N real matrix Q_out with orthonormal columns,
+*> which are the first N columns of a product of real orthogonal
+*> matrices of order M which are returned by SLATSQR
+*>
+*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> See the documentation for SLATSQR.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*> MB is INTEGER
+*> The row block size used by SLATSQR to return
+*> arrays A and T. MB > N.
+*> (Note that if MB > M, then M is used instead of MB
+*> as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*> NB is INTEGER
+*> The column block size used by SLATSQR to return
+*> arrays A and T. NB >= 1.
+*> (Note that if NB > N, then N is used instead of NB
+*> as the column block size).
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA,N)
+*>
+*> On entry:
+*>
+*> The elements on and above the diagonal are not accessed.
+*> The elements below the diagonal represent the unit
+*> lower-trapezoidal blocked matrix V computed by SLATSQR
+*> that defines the input matrices Q_in(k) (ones on the
+*> diagonal are not stored) (same format as the output A
+*> below the diagonal in SLATSQR).
+*>
+*> On exit:
+*>
+*> The array A contains an M-by-N orthonormal matrix Q_out,
+*> i.e. the columns of A are orthogonal unit vectors.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is REAL array,
+*> dimension (LDT, N * NIRB)
+*> where NIRB = Number_of_input_row_blocks
+*> = MAX( 1, CEIL((M-N)/(MB-N)) )
+*> Let NICB = Number_of_input_col_blocks
+*> = CEIL(N/NB)
+*>
+*> The upper-triangular block reflectors used to define the
+*> input matrices Q_in(k), k=(1:NIRB*NICB). The block
+*> reflectors are stored in compact form in NIRB block
+*> reflector sequences. Each of NIRB block reflector sequences
+*> is stored in a larger NB-by-N column block of T and consists
+*> of NICB smaller NB-by-NB upper-triangular column blocks.
+*> (same format as the output T in SLATSQR).
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T.
+*> LDT >= max(1,min(NB,N)).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> (workspace) REAL array, dimension (MAX(2,LWORK))
+*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> The dimension of the array WORK. LWORK >= (M+NB)*N.
+*> If LWORK = -1, then a workspace query is assumed.
+*> The routine only calculates the optimal size of the WORK
+*> array, returns this value as the first entry of the WORK
+*> array, and no error message related to LWORK is issued
+*> by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup singleOTHERcomputational
+*
+*> \par Contributors:
+* ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*> Computer Science Division,
+*> University of California, Berkeley
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+ $ INFO )
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.9.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2019
+*
+* .. Scalar Arguments ..
+ INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB
+* ..
+* .. Array Arguments ..
+ REAL A( LDA, * ), T( LDT, * ), WORK( * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ONE, ZERO
+ PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LQUERY
+ INTEGER IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SLAMTSQR, SLASET, XERBLA
+* ..
+* ..
Intrinsic Functions ..
+ INTRINSIC REAL, MAX, MIN
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters
+*
+ LQUERY = LWORK.EQ.-1
+ INFO = 0
+ IF( M.LT.0 ) THEN
+ INFO = -1
+ ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+ INFO = -2
+ ELSE IF( MB.LE.N ) THEN
+ INFO = -3
+ ELSE IF( NB.LT.1 ) THEN
+ INFO = -4
+ ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+ INFO = -6
+ ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+ INFO = -8
+ ELSE
+*
+* Test the input LWORK for the dimension of the array WORK.
+* This workspace is used to store array C(LDC, N) and WORK(LWORK)
+* in the call to SLAMTSQR. See the documentation for SLAMTSQR.
+*
+ IF( LWORK.LT.2 .AND. (.NOT.LQUERY) ) THEN
+ INFO = -10
+ ELSE
+*
+* Set block size for column blocks
+*
+ NBLOCAL = MIN( NB, N )
+*
+* Set the size for the array C(LDC,N) in the SLAMTSQR call
+* and set the optimal size of the work array WORK(LWORK)
+* in the SLAMTSQR call.
+*
+ LDC = M
+ LC = LDC*N
+ LW = N * NBLOCAL
+*
+ LWORKOPT = LC+LW
+*
+ IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+ INFO = -10
+ END IF
+ END IF
+*
+ END IF
+*
+* Handle error in the input parameters and return workspace query.
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SORGTSQR', -INFO )
+ RETURN
+ ELSE IF ( LQUERY ) THEN
+ WORK( 1 ) = REAL( LWORKOPT )
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( MIN( M, N ).EQ.0 ) THEN
+ WORK( 1 ) = REAL( LWORKOPT )
+ RETURN
+ END IF
+*
+* (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in
+* of M-by-M orthogonal matrix Q_in, which is implicitly stored in
+* the subdiagonal part of input array A and in the input array T.
+* This is done by the following operation using the routine SLAMTSQR.
+*
+* Q1_in = Q_in * ( I ), where I is an N-by-N identity matrix,
+* ( 0 ) 0 is an (M-N)-by-N zero matrix.
+*
+* (1a) Form M-by-N matrix in the array WORK(1:LDC*N) with ones
+* on the diagonal and zeros elsewhere.
+*
+ CALL SLASET( 'F', M, N, ZERO, ONE, WORK, LDC )
+*
+* (1b) On input, WORK(1:LDC*N) stores ( I );
+* ( 0 )
+*
+* On output, WORK(1:LDC*N) stores Q1_in.
+*
+ CALL SLAMTSQR( 'L', 'N', M, N, N, MB, NBLOCAL, A, LDA, T, LDT,
+ $ WORK, LDC, WORK( LC+1 ), LW, IINFO )
+*
+* (2) Copy the result from the part of the work array (1:M,1:N)
+* with the leading dimension LDC that starts at WORK(1) into
+* the output array A(1:M,1:N) column-by-column.
+*
+ DO J = 1, N
+ CALL SCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 )
+ END DO
+*
+ WORK( 1 ) = REAL( LWORKOPT )
+ RETURN
+*
+* End of SORGTSQR
+*
+ END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/sorhr_col.f b/lapack-netlib/SRC/sorhr_col.f
new file mode 100644
index 000000000..38976245c
--- /dev/null
+++ b/lapack-netlib/SRC/sorhr_col.f
@@ -0,0 +1,439 @@
+*> \brief \b SORHR_COL
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SORHR_COL + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*>
+* Definition:
+* ===========
+*
+* SUBROUTINE SORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+*
+* .. Scalar Arguments ..
+* INTEGER INFO, LDA, LDT, M, N, NB
+* ..
+* .. Array Arguments ..
+* REAL A( LDA, * ), D( * ), T( LDT, * )
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SORHR_COL takes an M-by-N real matrix Q_in with orthonormal columns
+*> as input, stored in A, and performs Householder Reconstruction (HR),
+*> i.e.
reconstructs Householder vectors V(i) implicitly representing
+*> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+*> where S is an N-by-N diagonal matrix with diagonal entries
+*> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+*> stored in A on output, and the diagonal entries of S are stored in D.
+*> Block reflectors are also returned in T
+*> (same output format as SGEQRT).
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*> NB is INTEGER
+*> The column block size to be used in the reconstruction
+*> of Householder column vector blocks in the array A and
+*> corresponding block reflectors in the array T. NB >= 1.
+*> (Note that if NB > N, then N is used instead of NB
+*> as the column block size.)
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA,N)
+*>
+*> On entry:
+*>
+*> The array A contains an M-by-N orthonormal matrix Q_in,
+*> i.e. the columns of A are orthogonal unit vectors.
+*>
+*> On exit:
+*>
+*> The elements below the diagonal of A represent the unit
+*> lower-trapezoidal matrix V of Householder column vectors
+*> V(i). The unit diagonal entries of V are not stored
+*> (same format as the output below the diagonal in A from
+*> SGEQRT). The matrix T and the matrix V stored on output
+*> in A implicitly define Q_out.
+*>
+*> The elements above the diagonal contain the factor U
+*> of the "modified" LU-decomposition:
+*> Q_in - ( S ) = V * U
+*> ( 0 )
+*> where 0 is an (M-N)-by-N zero matrix.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*> T is REAL array,
+*> dimension (LDT, N)
+*>
+*> Let NOCB = Number_of_output_col_blocks
+*> = CEIL(N/NB)
+*>
+*> On exit, T(1:NB, 1:N) contains NOCB upper-triangular
+*> block reflectors used to define Q_out stored in compact
+*> form as a sequence of upper-triangular NB-by-NB column
+*> blocks (same format as the output T in SGEQRT).
+*> The matrix T and the matrix V stored on output in A
+*> implicitly define Q_out. NOTE: The lower triangles
+*> below the upper-triangular blocks will be filled with
+*> zeros. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T.
+*> LDT >= max(1,min(NB,N)).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*> D is REAL array, dimension min(M,N).
+*> The elements can be only plus or minus one.
+*>
+*> D(i) is constructed as D(i) = -SIGN(Q_in_i(i,i)), where
+*> 1 <= i <= min(M,N), and Q_in_i is Q_in after performing
+*> i-1 steps of "modified" Gaussian elimination.
+*> See Further Details.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The computed M-by-M orthogonal factor Q_out is defined implicitly as
+*> a product of orthogonal matrices Q_out(i). Each Q_out(i) is stored in
+*> the compact WY-representation format in the corresponding blocks of
+*> matrices V (stored in A) and T.
+*>
+*> The M-by-N unit lower-trapezoidal matrix V stored in the M-by-N
+*> matrix A contains the column vectors V(i) in NB-size column
+*> blocks VB(j). For example, VB(1) contains the columns
+*> V(1), V(2), ... V(NB). NOTE: The unit entries on
+*> the diagonal of V are not stored in A.
+*>
+*> The number of column blocks is
+*>
+*> NOCB = Number_of_output_col_blocks = CEIL(N/NB)
+*>
+*> where each block is of order NB except for the last block, which
+*> is of order LAST_NB = N - (NOCB-1)*NB.
+*>
+*> For example, if M=6, N=5 and NB=2, the matrix V is
+*>
+*>
+*> V = ( VB(1), VB(2), VB(3) ) =
+*>
+*> = ( 1 )
+*> ( v21 1 )
+*> ( v31 v32 1 )
+*> ( v41 v42 v43 1 )
+*> ( v51 v52 v53 v54 1 )
+*> ( v61 v62 v63 v64 v65 )
+*>
+*>
+*> For each of the column blocks VB(i), an upper-triangular block
+*> reflector TB(i) is computed. These blocks are stored as
+*> a sequence of upper-triangular column blocks in the NB-by-N
+*> matrix T. The size of each TB(i) block is NB-by-NB, except
+*> for the last block, whose size is LAST_NB-by-LAST_NB.
+*>
+*> For example, if M=6, N=5 and NB=2, the matrix T is
+*>
+*> T = ( TB(1), TB(2), TB(3) ) =
+*>
+*> = ( t11 t12 t13 t14 t15 )
+*> ( t22 t24 )
+*>
+*>
+*> The M-by-M factor Q_out is given as a product of NOCB
+*> orthogonal M-by-M matrices Q_out(i).
+*>
+*> Q_out = Q_out(1) * Q_out(2) * ... * Q_out(NOCB),
+*>
+*> where each matrix Q_out(i) is given by the WY-representation
+*> using corresponding blocks from the matrices V and T:
+*>
+*> Q_out(i) = I - VB(i) * TB(i) * (VB(i))**T,
+*>
+*> where I is the identity matrix. Here is the formula with matrix
+*> dimensions:
+*>
+*> Q(i){M-by-M} = I{M-by-M} -
+*> VB(i){M-by-INB} * TB(i){INB-by-INB} * (VB(i))**T {INB-by-M},
+*>
+*> where INB = NB, except for the last block NOCB
+*> for which INB=LAST_NB.
+*>
+*> =====
+*> NOTE:
+*> =====
+*>
+*> If Q_in is the result of doing a QR factorization
+*> B = Q_in * R_in, then:
+*>
+*> B = (Q_out*S) * R_in = Q_out * (S * R_in) = Q_out * R_out.
+*>
+*> So if one wants to interpret Q_out as the result
+*> of the QR factorization of B, then the corresponding R_out
+*> should be obtained by R_out = S * R_in, i.e. some rows of R_in
+*> should be multiplied by -1.
+*>
+*> For the details of the algorithm, see [1].
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*> E. Solomonik, J. Parallel Distrib. Comput.,
+*> vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*>
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup singleOTHERcomputational
+*
+*> \par Contributors:
+* ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*> Computer Science Division,
+*> University of California, Berkeley
+*>
+*> \endverbatim
+*
+* =====================================================================
+ SUBROUTINE SORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+ IMPLICIT NONE
+*
+* -- LAPACK computational routine (version 3.9.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* November 2019
+*
+* .. Scalar Arguments ..
+ INTEGER INFO, LDA, LDT, M, N, NB
+* ..
+* .. Array Arguments ..
+ REAL A( LDA, * ), D( * ), T( LDT, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL ONE, ZERO
+ PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+* ..
+* .. Local Scalars ..
+ INTEGER I, IINFO, J, JB, JBTEMP1, JBTEMP2, JNB,
+ $ NPLUSONE
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM, XERBLA
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN
+* ..
+* .. Executable Statements ..
+*
+* Test the input parameters
+*
+ INFO = 0
+ IF( M.LT.0 ) THEN
+ INFO = -1
+ ELSE IF( N.LT.0 .OR. N.GT.M ) THEN
+ INFO = -2
+ ELSE IF( NB.LT.1 ) THEN
+ INFO = -3
+ ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+ INFO = -5
+ ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+ INFO = -7
+ END IF
+*
+* Handle error in the input parameters.
+*
+ IF( INFO.NE.0 ) THEN
+ CALL XERBLA( 'SORHR_COL', -INFO )
+ RETURN
+ END IF
+*
+* Quick return if possible
+*
+ IF( MIN( M, N ).EQ.0 ) THEN
+ RETURN
+ END IF
+*
+* On input, the M-by-N matrix A contains the orthogonal
+* M-by-N matrix Q_in.
+*
+* (1) Compute the unit lower-trapezoidal V (ones on the diagonal
+* are not stored) by performing the "modified" LU-decomposition.
+*
+* Q_in - ( S ) = V * U = ( V1 ) * U,
+* ( 0 ) ( V2 )
+*
+* where 0 is an (M-N)-by-N zero matrix.
+*
+* (1-1) Factor V1 and U.
+
+ CALL SLAORHR_COL_GETRFNP( N, N, A, LDA, D, IINFO )
+*
+* (1-2) Solve for V2.
+*
+ IF( M.GT.N ) THEN
+ CALL STRSM( 'R', 'U', 'N', 'N', M-N, N, ONE, A, LDA,
+ $ A( N+1, 1 ), LDA )
+ END IF
+*
+* (2) Reconstruct the block reflector T stored in T(1:NB, 1:N)
+* as a sequence of upper-triangular blocks with NB-size column
+* blocking.
+*
+* Loop over the column blocks of size NB of the array A(1:M,1:N)
+* and the array T(1:NB,1:N), JB is the column index of a column
+* block, JNB is the column block size at each step JB.
+*
+ NPLUSONE = N + 1
+ DO JB = 1, N, NB
+*
+* (2-0) Determine the column block size JNB.
+*
+ JNB = MIN( NPLUSONE-JB, NB )
+*
+* (2-1) Copy the upper-triangular part of the current JNB-by-JNB
+* diagonal block U(JB) (of the N-by-N matrix U) stored
+* in A(JB:JB+JNB-1,JB:JB+JNB-1) into the upper-triangular part
+* of the current JNB-by-JNB block T(1:JNB,JB:JB+JNB-1)
+* column-by-column, total JNB*(JNB+1)/2 elements.
+*
+ JBTEMP1 = JB - 1
+ DO J = JB, JB+JNB-1
+ CALL SCOPY( J-JBTEMP1, A( JB, J ), 1, T( 1, J ), 1 )
+ END DO
+*
+* (2-2) Perform on the upper-triangular part of the current
+* JNB-by-JNB diagonal block U(JB) (of the N-by-N matrix U) stored
+* in T(1:JNB,JB:JB+JNB-1) the following operation in place:
+* (-1)*U(JB)*S(JB), i.e. the result will be stored in the upper-
+* triangular part of T(1:JNB,JB:JB+JNB-1). This multiplication
+* of the JNB-by-JNB diagonal block U(JB) by the JNB-by-JNB
+* diagonal block S(JB) of the N-by-N sign matrix S from the
+* right means changing the sign of each J-th column of the block
+* U(JB) according to the sign of the diagonal element of the block
+* S(JB), i.e. S(J,J) that is stored in the array element D(J).
+*
+ DO J = JB, JB+JNB-1
+ IF( D( J ).EQ.ONE ) THEN
+ CALL SSCAL( J-JBTEMP1, -ONE, T( 1, J ), 1 )
+ END IF
+ END DO
+*
+* (2-3) Perform the triangular solve for the current block
+* matrix X(JB):
+*
+* X(JB) * (A(JB)**T) = B(JB), where:
+*
+* A(JB)**T is a JNB-by-JNB unit upper-triangular
+* coefficient block, and A(JB)=V1(JB), which
+* is a JNB-by-JNB unit lower-triangular block
+* stored in A(JB:JB+JNB-1,JB:JB+JNB-1).
+* The N-by-N matrix V1 is the upper part +* of the M-by-N lower-trapezoidal matrix V +* stored in A(1:M,1:N); +* +* B(JB) is a JNB-by-JNB upper-triangular right-hand +* side block, B(JB) = (-1)*U(JB)*S(JB), and +* B(JB) is stored in T(1:JNB,JB:JB+JNB-1); +* +* X(JB) is a JNB-by-JNB upper-triangular solution +* block, X(JB) is the upper-triangular block +* reflector T(JB), and X(JB) is stored +* in T(1:JNB,JB:JB+JNB-1). +* +* In other words, we perform the triangular solve for the +* upper-triangular block T(JB): +* +* T(JB) * (V1(JB)**T) = (-1)*U(JB)*S(JB). +* +* Even though the blocks X(JB) and B(JB) are upper- +* triangular, the routine STRSM will access all JNB**2 +* elements of the square T(1:JNB,JB:JB+JNB-1). Therefore, +* we need to set to zero the elements of the block +* T(1:JNB,JB:JB+JNB-1) below the diagonal before the call +* to STRSM. +* +* (2-3a) Set the elements to zero. +* + JBTEMP2 = JB - 2 + DO J = JB, JB+JNB-2 + DO I = J-JBTEMP2, NB + T( I, J ) = ZERO + END DO + END DO +* +* (2-3b) Perform the triangular solve. +* + CALL STRSM( 'R', 'L', 'T', 'U', JNB, JNB, ONE, + $ A( JB, JB ), LDA, T( 1, JB ), LDT ) +* + END DO +* + RETURN +* +* End of SORHR_COL +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/sporfsx.f b/lapack-netlib/SRC/sporfsx.f index 52fab6976..ce8c26569 100644 --- a/lapack-netlib/SRC/sporfsx.f +++ b/lapack-netlib/SRC/sporfsx.f @@ -135,7 +135,7 @@ *> \param[in,out] S *> \verbatim *> S is REAL array, dimension (N) -*> The row scale factors for A. If EQUED = 'Y', A is multiplied on +*> The scale factors for A. If EQUED = 'Y', A is multiplied on *> the left and right by diag(S). S is an input argument if FACT = *> 'F'; otherwise, S is an output argument. If FACT = 'F' and EQUED *> = 'Y', each element of S must be positive. If S is output, each @@ -263,7 +263,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -299,14 +299,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -314,9 +314,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. 
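For orientation, the three routines touched above -- SLATSQR, SORGTSQR and SORHR_COL -- compose into a tall-skinny QR pipeline: factor B = Q_in * R_in, form Q_in explicitly, then reconstruct Householder vectors so that Q_in = Q_out * S. The following sketch is illustrative only and is not part of the patch; the matrix size, block sizes and workspace length (M=60, N=5, MB=12, NB=2, LWORK=(M+NB)*N) are assumptions chosen to satisfy the documented constraints (M >= N, MB > N, LDT >= min(NB,N)), and T is given N*CEIL((M-N)/(MB-N)) = 40 columns as required here by SLATSQR and SORGTSQR.

      PROGRAM TSQRHR
      IMPLICIT NONE
*     Illustrative sizes: NIRB = CEIL((M-N)/(MB-N)) = CEIL(55/7) = 8
*     input row blocks, so T needs N*NIRB = 40 columns for the
*     SLATSQR/SORGTSQR stages.  LWORK = (M+NB)*N covers all calls.
      INTEGER            M, N, MB, NB, LDA, LDT, LWORK
      PARAMETER          ( M = 60, N = 5, MB = 12, NB = 2, LDA = M,
     $                     LDT = NB, LWORK = ( M+NB )*N )
      INTEGER            INFO, I, J
      REAL               A( LDA, N ), R( N, N ), T( LDT, 8*N ),
     $                   D( N ), WORK( LWORK )
*     ... fill A(1:M,1:N) with the matrix B to be factored ...
*
*     (1) Blocked TSQR: B = Q_in * R_in.  R_in is returned in the
*     upper triangle of A(1:N,1:N); save it before it is overwritten.
      CALL SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, INFO )
      DO J = 1, N
         DO I = 1, J
            R( I, J ) = A( I, J )
         END DO
      END DO
*
*     (2) Form the explicit M-by-N orthonormal factor Q_in in A.
      CALL SORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, INFO )
*
*     (3) Householder reconstruction: Q_in = Q_out * S.  V overwrites
*     A, compact block reflectors overwrite T(1:NB,1:N), S is in D.
      CALL SORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
*
*     (4) R_out = S * R_in: negate row I of R_in where D(I) = -1,
*     so that B = Q_out * R_out, as described in the SORHR_COL notes.
      DO I = 1, N
         IF( D( I ).LT.0.0E+0 ) THEN
            DO J = I, N
               R( I, J ) = -R( I, J )
            END DO
         END IF
      END DO
      END

Step (4) realizes the sign fix-up R_out = S * R_in from the NOTE section of SORHR_COL; the diagonal S is never formed explicitly, only its entries in D are applied row by row.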
diff --git a/lapack-netlib/SRC/sposvxx.f b/lapack-netlib/SRC/sposvxx.f index 3cdfa749c..fa2c0d3f3 100644 --- a/lapack-netlib/SRC/sposvxx.f +++ b/lapack-netlib/SRC/sposvxx.f @@ -366,7 +366,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -402,14 +402,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -417,9 +417,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/ssb2st_kernels.f b/lapack-netlib/SRC/ssb2st_kernels.f index 54479f89e..08859169b 100644 --- a/lapack-netlib/SRC/ssb2st_kernels.f +++ b/lapack-netlib/SRC/ssb2st_kernels.f @@ -1,26 +1,26 @@ *> \brief \b SSB2ST_KERNELS * * @generated from zhb2st_kernels.f, fortran z -> s, Wed Dec 7 08:22:40 2016 -* +* * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download SSB2ST_KERNELS + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download SSB2ST_KERNELS + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, +* SUBROUTINE SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, * ST, ED, SWEEP, N, NB, IB, * A, LDA, V, TAU, LDVT, WORK) * @@ -32,9 +32,9 @@ * INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT * .. * .. Array Arguments .. -* REAL A( LDA, * ), V( * ), +* REAL A( LDA, * ), V( * ), * TAU( * ), WORK( * ) -* +* *> \par Purpose: * ============= *> @@ -124,7 +124,7 @@ *> LDVT is INTEGER. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is REAL array. Workspace of size nb. *> \endverbatim @@ -150,7 +150,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -158,16 +158,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, + SUBROUTINE SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, $ ST, ED, SWEEP, N, NB, IB, $ A, LDA, V, TAU, LDVT, WORK) * @@ -184,7 +184,7 @@ INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT * .. * .. Array Arguments .. - REAL A( LDA, * ), V( * ), + REAL A( LDA, * ), V( * ), $ TAU( * ), WORK( * ) * .. * @@ -198,8 +198,8 @@ * .. Local Scalars .. LOGICAL UPPER INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS, - $ DPOS, OFDPOS, AJETER - REAL CTMP + $ DPOS, OFDPOS, AJETER + REAL CTMP * .. * .. External Subroutines .. EXTERNAL SLARFG, SLARFX, SLARFY @@ -212,7 +212,7 @@ * .. * .. * .. Executable Statements .. -* +* AJETER = IB + LDVT UPPER = LSAME( UPLO, 'U' ) @@ -243,10 +243,10 @@ V( VPOS ) = ONE DO 10 I = 1, LM-1 V( VPOS+I ) = ( A( OFDPOS-I, ST+I ) ) - A( OFDPOS-I, ST+I ) = ZERO + A( OFDPOS-I, ST+I ) = ZERO 10 CONTINUE CTMP = ( A( OFDPOS, ST ) ) - CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, + CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) A( OFDPOS, ST ) = CTMP * @@ -284,14 +284,14 @@ * V( VPOS ) = ONE DO 30 I = 1, LM-1 - V( VPOS+I ) = + V( VPOS+I ) = $ ( A( DPOS-NB-I, J1+I ) ) A( DPOS-NB-I, J1+I ) = ZERO 30 CONTINUE CTMP = ( A( DPOS-NB, J1 ) ) CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) ) A( DPOS-NB, J1 ) = CTMP -* +* CALL SLARFX( 'Right', LN-1, LM, V( VPOS ), $ TAU( TAUPOS ), $ A( DPOS-NB+1, J1 ), LDA-1, WORK) @@ -299,9 +299,9 @@ ENDIF * * Lower case -* +* ELSE -* +* IF( WANTZ ) THEN VPOS = MOD( SWEEP-1, 2 ) * N + ST TAUPOS = MOD( SWEEP-1, 2 ) * N + ST @@ -316,9 +316,9 @@ V( VPOS ) = ONE DO 20 I = 1, LM-1 V( VPOS+I ) = A( OFDPOS+I, ST-1 ) - A( OFDPOS+I, ST-1 ) = ZERO + A( OFDPOS+I, ST-1 ) = ZERO 20 CONTINUE - CALL SLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, + CALL SLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) * LM = ED - ST + 1 @@ -345,7 +345,7 @@ LM = J2-J1+1 * IF( LM.GT.0) THEN - CALL SLARFX( 'Right', LM, LN, V( VPOS ), + CALL SLARFX( 'Right', LM, LN, V( VPOS ), $ TAU( TAUPOS ), A( DPOS+NB, ST ), $ LDA-1, WORK) * @@ -362,13 +362,13 @@ V( VPOS+I ) = A( DPOS+NB+I, ST ) A( DPOS+NB+I, ST ) = ZERO 40 CONTINUE - CALL SLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, + CALL SLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) * - CALL SLARFX( 'Left', LM, LN-1, V( VPOS ), + CALL SLARFX( 'Left', LM, LN-1, V( VPOS ), $ ( TAU( TAUPOS ) ), $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK) - + ENDIF ENDIF ENDIF @@ -377,4 +377,4 @@ * * END OF SSB2ST_KERNELS * - END + END diff --git a/lapack-netlib/SRC/ssbgvx.f b/lapack-netlib/SRC/ssbgvx.f index 3408810bd..22f96729e 100644 --- 
a/lapack-netlib/SRC/ssbgvx.f +++ b/lapack-netlib/SRC/ssbgvx.f @@ -261,11 +261,11 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit -*> < 0 : if INFO = -i, the i-th argument had an illegal value +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value *> <= N: if INFO = i, then i eigenvectors failed to converge. *> Their indices are stored in IFAIL. -*> > N : SPBSTF returned an error code; i.e., +*> > N: SPBSTF returned an error code; i.e., *> if INFO = N + i, for 1 <= i <= N, then the leading *> minor of order i of B is not positive definite. *> The factorization of B could not be completed and diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f index 228538161..d550f87e0 100644 --- a/lapack-netlib/SRC/sstemr.f +++ b/lapack-netlib/SRC/sstemr.f @@ -233,13 +233,13 @@ *> \param[in,out] TRYRAC *> \verbatim *> TRYRAC is LOGICAL -*> If TRYRAC.EQ..TRUE., indicates that the code should check whether +*> If TRYRAC = .TRUE., indicates that the code should check whether *> the tridiagonal matrix defines its eigenvalues to high relative *> accuracy. If so, the code uses relative-accuracy preserving *> algorithms that might be (a bit) slower depending on the matrix. *> If the matrix does not define its eigenvalues to high relative *> accuracy, the code can uses possibly faster algorithms. -*> If TRYRAC.EQ..FALSE., the code is not required to guarantee +*> If TRYRAC = .FALSE., the code is not required to guarantee *> relatively accurate eigenvalues and can use the fastest possible *> techniques. *> On exit, a .TRUE. TRYRAC will be set to .FALSE. if the matrix diff --git a/lapack-netlib/SRC/ssyconvf.f b/lapack-netlib/SRC/ssyconvf.f index d43b9473f..c6f08428f 100644 --- a/lapack-netlib/SRC/ssyconvf.f +++ b/lapack-netlib/SRC/ssyconvf.f @@ -291,7 +291,7 @@ * * Convert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in factorization order where i decreases from N to 1 * I = N @@ -344,7 +344,7 @@ * * Revert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in reverse factorization order where i increases from 1 to N * I = 1 @@ -435,7 +435,7 @@ * * Convert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in factorization order where k increases from 1 to N * I = 1 @@ -488,7 +488,7 @@ * * Revert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in reverse factorization order where i decreases from N to 1 * I = N diff --git a/lapack-netlib/SRC/ssyconvf_rook.f b/lapack-netlib/SRC/ssyconvf_rook.f index 833b9c632..a7e0d5258 100644 --- a/lapack-netlib/SRC/ssyconvf_rook.f +++ b/lapack-netlib/SRC/ssyconvf_rook.f @@ -282,7 +282,7 @@ * * Convert PERMUTATIONS * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in factorization order where i decreases from N to 1 * I = N @@ -333,7 +333,7 @@ * * Revert PERMUTATIONS * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in reverse factorization order where i increases from 1 to N * I = 1 @@ -423,7 +423,7 @@ * * Convert PERMUTATIONS * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower 
part of A * in factorization order where i increases from 1 to N * I = 1 @@ -474,7 +474,7 @@ * * Revert PERMUTATIONS * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in reverse factorization order where i decreases from N to 1 * I = N diff --git a/lapack-netlib/SRC/ssyev_2stage.f b/lapack-netlib/SRC/ssyev_2stage.f index 166766919..5d354c1b3 100644 --- a/lapack-netlib/SRC/ssyev_2stage.f +++ b/lapack-netlib/SRC/ssyev_2stage.f @@ -317,7 +317,7 @@ IF( .NOT.WANTZ ) THEN CALL SSTERF( N, W, WORK( INDE ), INFO ) ELSE -* Not available in this release, and agrument checking should not +* Not available in this release, and argument checking should not * let it getting here RETURN CALL SORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ), diff --git a/lapack-netlib/SRC/ssyevd_2stage.f b/lapack-netlib/SRC/ssyevd_2stage.f index 8ab90b641..625713b85 100644 --- a/lapack-netlib/SRC/ssyevd_2stage.f +++ b/lapack-netlib/SRC/ssyevd_2stage.f @@ -385,7 +385,7 @@ IF( .NOT.WANTZ ) THEN CALL SSTERF( N, W, WORK( INDE ), INFO ) ELSE -* Not available in this release, and agrument checking should not +* Not available in this release, and argument checking should not * let it getting here RETURN CALL SSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N, diff --git a/lapack-netlib/SRC/ssyrfsx.f b/lapack-netlib/SRC/ssyrfsx.f index b5dd0b2df..bfb7b6005 100644 --- a/lapack-netlib/SRC/ssyrfsx.f +++ b/lapack-netlib/SRC/ssyrfsx.f @@ -271,7 +271,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -307,14 +307,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -322,9 +322,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index e470f5883..7e58d1e75 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -42,7 +42,7 @@ *> matrices. 
*> *> Aasen's algorithm is used to factor A as -*> A = U * T * U**T, if UPLO = 'U', or +*> A = U**T * T * U, if UPLO = 'U', or *> A = L * T * L**T, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is symmetric tridiagonal. The factored @@ -86,7 +86,7 @@ *> *> On exit, if INFO = 0, the tridiagonal matrix T and the *> multipliers used to obtain the factor U or L from the -*> factorization A = U*T*U**T or A = L*T*L**T as computed by +*> factorization A = U**T*T*U or A = L*T*L**T as computed by *> SSYTRF. *> \endverbatim *> @@ -229,7 +229,7 @@ RETURN END IF * -* Compute the factorization A = U*T*U**T or A = L*T*L**T. +* Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) IF( INFO.EQ.0 ) THEN diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index 43d937141..5e2e0e340 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -44,7 +44,7 @@ *> matrices. *> *> Aasen's 2-stage algorithm is used to factor A as -*> A = U * T * U**T, if UPLO = 'U', or +*> A = U**T * T * U, if UPLO = 'U', or *> A = L * T * L**T, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is symmetric and band. The matrix T is @@ -258,7 +258,7 @@ END IF * * -* Compute the factorization A = U*T*U**T or A = L*T*L**T. +* Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, $ WORK, LWORK, INFO ) diff --git a/lapack-netlib/SRC/ssysvxx.f b/lapack-netlib/SRC/ssysvxx.f index 4762748c0..e2be0128b 100644 --- a/lapack-netlib/SRC/ssysvxx.f +++ b/lapack-netlib/SRC/ssysvxx.f @@ -377,7 +377,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -413,14 +413,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is REAL array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -428,9 +428,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. 
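Several hunks in this patch (ssysv_aa above, ssytrf_aa and ssytrs_aa below) correct the documented Aasen factorization form to A = U**T*T*U for UPLO = 'U'. For reference, a minimal factor/solve sketch; it is illustrative only and not part of the patch, with the size N and LWORK chosen as assumptions comfortably above the documented minima (2*N for SSYTRF_AA, 3*N-2 for SSYTRS_AA):

      PROGRAM AASEN
      IMPLICIT NONE
*     Illustrative sizes; WORK is shared by both calls.
      INTEGER            N, NRHS, LDA, LDB, LWORK
      PARAMETER          ( N = 4, NRHS = 1, LDA = N, LDB = N,
     $                     LWORK = 64*N )
      INTEGER            INFO, IPIV( N )
      REAL               A( LDA, N ), B( LDB, NRHS ), WORK( LWORK )
*     ... fill the upper triangle of A and the right-hand side B ...
*
*     Factor A = U**T*T*U (UPLO = 'U'), then solve A*X = B.  The
*     solve proceeds in the three stages spelled out in the
*     reorganized SSYTRS_AA below: forward substitution with U**T,
*     a tridiagonal solve with T (via SGTSV), and backward
*     substitution with U, with pivoting applied before and after.
      CALL SSYTRF_AA( 'U', N, A, LDA, IPIV, WORK, LWORK, INFO )
      IF( INFO.EQ.0 )
     $   CALL SSYTRS_AA( 'U', N, NRHS, A, LDA, IPIV, B, LDB,
     $                   WORK, LWORK, INFO )
      END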
diff --git a/lapack-netlib/SRC/ssytf2_rk.f b/lapack-netlib/SRC/ssytf2_rk.f index bf113d1bd..400e48353 100644 --- a/lapack-netlib/SRC/ssytf2_rk.f +++ b/lapack-netlib/SRC/ssytf2_rk.f @@ -312,7 +312,7 @@ * * Factorize A as U*D*U**T using the upper triangle of A * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = ZERO @@ -623,7 +623,7 @@ * * Factorize A as L*D*L**T using the lower triangle of A * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = ZERO * diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f index 7ddc0224e..d2502f483 100644 --- a/lapack-netlib/SRC/ssytrd_2stage.f +++ b/lapack-netlib/SRC/ssytrd_2stage.f @@ -123,23 +123,22 @@ *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is REAL array, dimension LHOUS2, that -*> store the Householder representation of the stage2 +*> HOUS2 is REAL array, dimension (LHOUS2) +*> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim *> *> \param[in] LHOUS2 *> \verbatim *> LHOUS2 is INTEGER -*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array HOUS2. +*> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns *> this value as the first entry of the HOUS2 array, and no error *> message related to LHOUS2 is issued by XERBLA. -*> LHOUS2 = MAX(1, dimension) where -*> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> If VECT='N', LHOUS2 = max(1, 4*n); +*> if VECT='V', option not yet available. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index 0df1173e4..1d8c9f5c5 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -50,9 +50,9 @@ * Arguments: * ========== * -*> \param[in] STAGE +*> \param[in] STAGE1 *> \verbatim -*> STAGE is CHARACTER*1 +*> STAGE1 is CHARACTER*1 *> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the ssytrd_sy2sb routine *> was not called before this routine to reproduce AB. diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index 272876700..98169dc00 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -363,7 +363,7 @@ * * * Set the workspace of the triangular matrix T to zero once such a -* way everytime T is generated the upper/lower portion will be always zero +* way every time T is generated the upper/lower portion will be always zero * CALL SLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT ) * diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index 2c29475df..ae4550f28 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -39,7 +39,7 @@ *> the Bunch-Kaufman diagonal pivoting method. The form of the *> factorization is *> -*> A = U*D*U**T or A = L*D*L**T +*> A = U**T*D*U or A = L*D*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and D is symmetric and block diagonal with @@ -144,7 +144,7 @@ *> *> \verbatim *> -*> If UPLO = 'U', then A = U*D*U**T, where +*> If UPLO = 'U', then A = U**T*D*U, where *> U = P(n)*U(n)* ... 
*P(k)U(k)* ..., *> i.e., U is a product of terms P(k)*U(k), where k decreases from n to *> 1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 @@ -262,7 +262,7 @@ * IF( UPPER ) THEN * -* Factorize A as U*D*U**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * * K is the main loop index, decreasing from N to 1 in steps of * KB, where KB is the number of columns factorized by SLASYF; diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index 4aaa978ad..7f428561c 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -37,7 +37,7 @@ *> SSYTRF_AA computes the factorization of a real symmetric matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a symmetric tridiagonal matrix. @@ -223,7 +223,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... * * Copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N)) @@ -256,7 +256,7 @@ $ A( MAX(1, J), J+1 ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J @@ -375,7 +375,7 @@ $ A( J+1, MAX(1, J) ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index 0e0f6edb7..03690815b 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -38,7 +38,7 @@ *> SSYTRF_AA_2STAGE computes the factorization of a real symmetric matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a symmetric band matrix with the @@ -275,7 +275,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... 
* DO J = 0, NT-1 @@ -442,12 +442,14 @@ c END IF * > Apply pivots to previous columns of L CALL SSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, $ A( (J+1)*NB+1, I2 ), 1 ) -* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL SSWAP( I2-I1-1, A( I1, I1+1 ), LDA, - $ A( I1+1, I2 ), 1 ) +* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) + IF( I2.GT.(I1+1) ) + $ CALL SSWAP( I2-I1-1, A( I1, I1+1 ), LDA, + $ A( I1+1, I2 ), 1 ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL SSWAP( N-I2, A( I1, I2+1 ), LDA, - $ A( I2, I2+1 ), LDA ) + IF( I2.LT.N ) + $ CALL SSWAP( N-I2, A( I1, I2+1 ), LDA, + $ A( I2, I2+1 ), LDA ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) @@ -616,11 +618,13 @@ c END IF CALL SSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, $ A( I2, (J+1)*NB+1 ), LDA ) * > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL SSWAP( I2-I1-1, A( I1+1, I1 ), 1, - $ A( I2, I1+1 ), LDA ) + IF( I2.GT.(I1+1) ) + $ CALL SSWAP( I2-I1-1, A( I1+1, I1 ), 1, + $ A( I2, I1+1 ), LDA ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL SSWAP( N-I2, A( I2+1, I1 ), 1, - $ A( I2+1, I2 ), 1 ) + IF( I2.LT.N ) + $ CALL SSWAP( N-I2, A( I2+1, I1 ), 1, + $ A( I2+1, I2 ), 1 ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 4b9ea4e7b..897116c23 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -62,7 +62,7 @@ *> \param[in,out] A *> \verbatim *> A is REAL array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers +*> On entry, the block diagonal matrix D and the multipliers *> used to obtain the factor U or L as computed by SSYTRF. *> *> On exit, if INFO = 0, the (symmetric) inverse of the original @@ -82,7 +82,7 @@ *> \param[in] IPIV *> \verbatim *> IPIV is INTEGER array, dimension (N) -*> Details of the interchanges and the NB structure of D +*> Details of the interchanges and the block structure of D *> as determined by SSYTRF. *> \endverbatim *> diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f index b05c9f7e6..ed4377ae7 100644 --- a/lapack-netlib/SRC/ssytrs_aa.f +++ b/lapack-netlib/SRC/ssytrs_aa.f @@ -37,7 +37,7 @@ *> \verbatim *> *> SSYTRS_AA solves a system of linear equations A*X = B with a real -*> symmetric matrix A using the factorization A = U*T*U**T or +*> symmetric matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by SSYTRF_AA. *> \endverbatim * @@ -49,7 +49,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -97,14 +97,16 @@ *> The leading dimension of the array B. LDB >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim -*> WORK is DOUBLE array, dimension (MAX(1,LWORK)) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER, LWORK >= MAX(1,3*N-2). +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -198,24 +200,31 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. 
+* +* 1) Forward substitution with U**T +* + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B * -* Pivot, P**T * B + K = 1 + DO WHILE ( K.LE.N ) + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + K = K + 1 + END DO * - K = 1 - DO WHILE ( K.LE.N ) - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - K = K + 1 - END DO +* Compute U**T \ B -> B [ (U**T \P**T * B) ] * -* Compute (U \P**T * B) -> B [ (U \P**T * B) ] + CALL STRSM( 'L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB) + END IF * - CALL STRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) +* 2) Solve with triangular matrix T * -* Compute T \ B -> B [ T \ (U \P**T * B) ] +* Compute T \ B -> B [ T \ (U**T \P**T * B) ] * CALL SLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1) IF( N.GT.1 ) THEN @@ -224,41 +233,53 @@ END IF CALL SGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB, $ INFO) +* +* 3) Backward substitution with U +* + IF( N.GT.1 ) THEN * * -* Compute (U**T \ B) -> B [ U**T \ (T \ (U \P**T * B) ) ] +* Compute U \ B -> B [ U \ (T \ (U**T \P**T * B) ) ] * - CALL STRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B(2, 1), LDB) + CALL STRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B(2, 1), LDB) * -* Pivot, P * B [ P * (U**T \ (T \ (U \P**T * B) )) ] +* Pivot, P * B -> B [ P * (U \ (T \ (U**T \P**T * B) )) ] * - K = N - DO WHILE ( K.GE.1 ) - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - K = K - 1 - END DO + K = N + DO WHILE ( K.GE.1 ) + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + K = K - 1 + END DO + END IF * ELSE * * Solve A*X = B, where A = L*T*L**T. * -* Pivot, P**T * B +* 1) Forward substitution with L * - K = 1 - DO WHILE ( K.LE.N ) - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - K = K + 1 - END DO + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B +* + K = 1 + DO WHILE ( K.LE.N ) + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + K = K + 1 + END DO * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute L \ B -> B [ (L \P**T * B) ] +* + CALL STRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1), + $ LDA, B(2, 1), LDB) + END IF * - CALL STRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1), LDA, - $ B(2, 1), LDB) +* 2) Solve with triangular matrix T * * Compute T \ B -> B [ T \ (L \P**T * B) ] * @@ -270,20 +291,25 @@ CALL SGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB, $ INFO) * -* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] +* 3) Backward substitution with L**T * - CALL STRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) + IF( N.GT.1 ) THEN +* +* Compute L**T \ B -> B [ L**T \ (T \ (L \P**T * B) ) ] * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] + CALL STRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB) * - K = N - DO WHILE ( K.GE.1 ) - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - K = K - 1 - END DO +* Pivot, P * B -> B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* + K = N + DO WHILE ( K.GE.1 ) + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + K = K - 1 + END DO + END IF * END IF * diff --git a/lapack-netlib/SRC/ssytrs_aa_2stage.f b/lapack-netlib/SRC/ssytrs_aa_2stage.f index d271b9481..cf2da529d 100644 --- a/lapack-netlib/SRC/ssytrs_aa_2stage.f +++ 
b/lapack-netlib/SRC/ssytrs_aa_2stage.f @@ -36,7 +36,7 @@ *> \verbatim *> *> SSYTRS_AA_2STAGE solves a system of linear equations A*X = B with a real -*> symmetric matrix A using the factorization A = U*T*U**T or +*> symmetric matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by SSYTRF_AA_2STAGE. *> \endverbatim * @@ -48,7 +48,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -208,15 +208,15 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL SLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (U**T \P**T * B) -> B [ (U**T \P**T * B) ] +* Compute (U**T \ B) -> B [ (U**T \P**T * B) ] * CALL STRSM( 'L', 'U', 'T', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) @@ -234,7 +234,7 @@ CALL STRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (U \ (T \ (U**T \P**T * B) )) ] +* Pivot, P * B -> B [ P * (U \ (T \ (U**T \P**T * B) )) ] * CALL SLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * @@ -246,11 +246,11 @@ * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL SLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute (L \ B) -> B [ (L \P**T * B) ] * CALL STRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) @@ -268,7 +268,7 @@ CALL STRSM( 'L', 'L', 'T', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* Pivot, P * B -> B [ P * (L**T \ (T \ (L \P**T * B) )) ] * CALL SLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * diff --git a/lapack-netlib/SRC/stgsy2.f b/lapack-netlib/SRC/stgsy2.f index ca9946a7e..2814889fc 100644 --- a/lapack-netlib/SRC/stgsy2.f +++ b/lapack-netlib/SRC/stgsy2.f @@ -71,7 +71,7 @@ *> R * B**T + L * E**T = scale * -F *> *> This case is used to compute an estimate of Dif[(A, D), (B, E)] = -*> sigma_min(Z) using reverse communicaton with SLACON. +*> sigma_min(Z) using reverse communication with SLACON. *> *> STGSY2 also (IJOB >= 1) contributes to the computation in STGSYL *> of an upper bound on the separation between to matrix pairs. Then @@ -85,7 +85,7 @@ *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 -*> = 'N', solve the generalized Sylvester equation (1). +*> = 'N': solve the generalized Sylvester equation (1). *> = 'T': solve the 'transposed' system (3). *> \endverbatim *> diff --git a/lapack-netlib/SRC/stgsyl.f b/lapack-netlib/SRC/stgsyl.f index cd597f37d..ff634b1de 100644 --- a/lapack-netlib/SRC/stgsyl.f +++ b/lapack-netlib/SRC/stgsyl.f @@ -88,20 +88,20 @@ *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 -*> = 'N', solve the generalized Sylvester equation (1). -*> = 'T', solve the 'transposed' system (3). +*> = 'N': solve the generalized Sylvester equation (1). +*> = 'T': solve the 'transposed' system (3). *> \endverbatim *> *> \param[in] IJOB *> \verbatim *> IJOB is INTEGER *> Specifies what kind of functionality to be performed. -*> =0: solve (1) only. -*> =1: The functionality of 0 and 3. -*> =2: The functionality of 0 and 4. -*> =3: Only an estimate of Dif[(A,D), (B,E)] is computed. +*> = 0: solve (1) only. +*> = 1: The functionality of 0 and 3. 
+*> = 2: The functionality of 0 and 4. +*> = 3: Only an estimate of Dif[(A,D), (B,E)] is computed. *> (look ahead strategy IJOB = 1 is used). -*> =4: Only an estimate of Dif[(A,D), (B,E)] is computed. +*> = 4: Only an estimate of Dif[(A,D), (B,E)] is computed. *> ( SGECON on sub-systems is used ). *> Not referenced if TRANS = 'T'. *> \endverbatim diff --git a/lapack-netlib/SRC/stpmlqt.f b/lapack-netlib/SRC/stpmlqt.f index 565dadd0c..8fc7823c2 100644 --- a/lapack-netlib/SRC/stpmlqt.f +++ b/lapack-netlib/SRC/stpmlqt.f @@ -94,7 +94,7 @@ *> *> \param[in] V *> \verbatim -*> V is REAL array, dimension (LDA,K) +*> V is REAL array, dimension (LDV,K) *> The i-th row must contain the vector which defines the *> elementary reflector H(i), for i = 1,2,...,k, as returned by *> DTPLQT in B. See Further Details. diff --git a/lapack-netlib/SRC/stpmqrt.f b/lapack-netlib/SRC/stpmqrt.f index b1813b7dd..6a5cbb981 100644 --- a/lapack-netlib/SRC/stpmqrt.f +++ b/lapack-netlib/SRC/stpmqrt.f @@ -94,7 +94,7 @@ *> *> \param[in] V *> \verbatim -*> V is REAL array, dimension (LDA,K) +*> V is REAL array, dimension (LDV,K) *> The i-th column must contain the vector which defines the *> elementary reflector H(i), for i = 1,2,...,k, as returned by *> CTPQRT in B. See Further Details. diff --git a/lapack-netlib/SRC/stprfb.f b/lapack-netlib/SRC/stprfb.f index 66e67252f..fcd164183 100644 --- a/lapack-netlib/SRC/stprfb.f +++ b/lapack-netlib/SRC/stprfb.f @@ -152,8 +152,8 @@ *> \verbatim *> LDA is INTEGER *> The leading dimension of the array A. -*> If SIDE = 'L', LDC >= max(1,K); -*> If SIDE = 'R', LDC >= max(1,M). +*> If SIDE = 'L', LDA >= max(1,K); +*> If SIDE = 'R', LDA >= max(1,M). *> \endverbatim *> *> \param[in,out] B diff --git a/lapack-netlib/SRC/zcgesv.f b/lapack-netlib/SRC/zcgesv.f index bb12d4f3a..b71018638 100644 --- a/lapack-netlib/SRC/zcgesv.f +++ b/lapack-netlib/SRC/zcgesv.f @@ -93,9 +93,9 @@ *> dimension (LDA,N) *> On entry, the N-by-N coefficient matrix A. *> On exit, if iterative refinement has been successfully used -*> (INFO.EQ.0 and ITER.GE.0, see description below), then A is +*> (INFO = 0 and ITER >= 0, see description below), then A is *> unchanged, if double precision factorization has been used -*> (INFO.EQ.0 and ITER.LT.0, see description below), then the +*> (INFO = 0 and ITER < 0, see description below), then the *> array A contains the factors L and U from the factorization *> A = P*L*U; the unit diagonal elements of L are not stored. *> \endverbatim @@ -112,8 +112,8 @@ *> The pivot indices that define the permutation matrix P; *> row i of the matrix was interchanged with row IPIV(i). *> Corresponds either to the single precision factorization -*> (if INFO.EQ.0 and ITER.GE.0) or the double precision -*> factorization (if INFO.EQ.0 and ITER.LT.0). +*> (if INFO = 0 and ITER >= 0) or the double precision +*> factorization (if INFO = 0 and ITER < 0). *> \endverbatim *> *> \param[in] B @@ -421,7 +421,7 @@ 30 CONTINUE * * If we are at this place of the code, this is because we have -* performed ITER=ITERMAX iterations and never satisified the stopping +* performed ITER=ITERMAX iterations and never satisfied the stopping * criterion, set up the ITER flag accordingly and follow up on double * precision routine. * diff --git a/lapack-netlib/SRC/zcposv.f b/lapack-netlib/SRC/zcposv.f index eafcce623..101d25f5d 100644 --- a/lapack-netlib/SRC/zcposv.f +++ b/lapack-netlib/SRC/zcposv.f @@ -111,9 +111,9 @@ *> elements need not be set and are assumed to be zero. 
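The comments fixed above describe the fallback logic of the mixed-precision solvers ZCGESV/ZCPOSV: factor and iterate in single precision and, if ITERMAX refinement steps never satisfy the stopping criterion, redo the solve entirely in double precision (signalled by ITER < 0). As an illustration only (not part of the patch), a toy C sketch of that control flow on a 2-by-2 real system; the single precision "factorization" is modeled by Cramer's rule in float, and the tolerance and ITERMAX values are simplified stand-ins for the ones used by the routines:

    #include <math.h>

    /* Column-major 2x2 solve by Cramer's rule: the "single precision solver". */
    static void solve2x2_float(const float A[4], const float b[2], float x[2])
    {
        float det = A[0]*A[3] - A[2]*A[1];
        x[0] = (b[0]*A[3] - A[2]*b[1]) / det;
        x[1] = (A[0]*b[1] - b[0]*A[1]) / det;
    }

    /* Returns the iteration count on success (ITER >= 0), or a negative
       value if the double precision fallback path was taken (ITER < 0). */
    int refine2x2(const double A[4], const double b[2], double x[2])
    {
        const int ITERMAX = 30;
        float Af[4], bf[2], xf[2];
        for (int i = 0; i < 4; ++i) Af[i] = (float)A[i];
        for (int i = 0; i < 2; ++i) bf[i] = (float)b[i];
        solve2x2_float(Af, bf, xf);          /* initial low precision solve */
        x[0] = xf[0]; x[1] = xf[1];
        for (int iter = 1; iter <= ITERMAX; ++iter) {
            /* residual r = b - A*x, computed in double precision */
            double r0 = b[0] - (A[0]*x[0] + A[2]*x[1]);
            double r1 = b[1] - (A[1]*x[0] + A[3]*x[1]);
            if (fabs(r0) + fabs(r1) <= 1e-14 * (fabs(x[0]) + fabs(x[1])))
                return iter;                 /* converged */
            /* correction solved with the single precision factorization */
            float rf[2] = { (float)r0, (float)r1 };
            solve2x2_float(Af, rf, xf);
            x[0] += xf[0]; x[1] += xf[1];
        }
        /* never satisfied the stopping criterion: follow up with a full
           double precision solve, mirroring the ITER < 0 path */
        double det = A[0]*A[3] - A[2]*A[1];
        x[0] = (b[0]*A[3] - A[2]*b[1]) / det;
        x[1] = (A[0]*b[1] - b[0]*A[1]) / det;
        return -ITERMAX - 1;
    }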
*> *> On exit, if iterative refinement has been successfully used -*> (INFO.EQ.0 and ITER.GE.0, see description below), then A is +*> (INFO = 0 and ITER >= 0, see description below), then A is *> unchanged, if double precision factorization has been used -*> (INFO.EQ.0 and ITER.LT.0, see description below), then the +*> (INFO = 0 and ITER < 0, see description below), then the *> array A contains the factor U or L from the Cholesky *> factorization A = U**H*U or A = L*L**H. *> \endverbatim @@ -431,7 +431,7 @@ 30 CONTINUE * * If we are at this place of the code, this is because we have -* performed ITER=ITERMAX iterations and never satisified the +* performed ITER=ITERMAX iterations and never satisfied the * stopping criterion, set up the ITER flag accordingly and follow * up on double precision routine. * diff --git a/lapack-netlib/SRC/zgbrfsx.f b/lapack-netlib/SRC/zgbrfsx.f index e40d7d23e..872709899 100644 --- a/lapack-netlib/SRC/zgbrfsx.f +++ b/lapack-netlib/SRC/zgbrfsx.f @@ -75,7 +75,7 @@ *> Specifies the form of the system of equations: *> = 'N': A * X = B (No transpose) *> = 'T': A**T * X = B (Transpose) -*> = 'C': A**H * X = B (Conjugate transpose = Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) *> \endverbatim *> *> \param[in] EQUED @@ -308,7 +308,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -344,14 +344,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -359,9 +359,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/zgbsvxx.f b/lapack-netlib/SRC/zgbsvxx.f index 9ba9c2ee3..0d916fd62 100644 --- a/lapack-netlib/SRC/zgbsvxx.f +++ b/lapack-netlib/SRC/zgbsvxx.f @@ -431,7 +431,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. 
*> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -467,14 +467,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -482,9 +482,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. +*> = 1.0: Use the extra-precise refinement algorithm. *> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/zgebak.f b/lapack-netlib/SRC/zgebak.f index a9761fde2..70c265e05 100644 --- a/lapack-netlib/SRC/zgebak.f +++ b/lapack-netlib/SRC/zgebak.f @@ -48,10 +48,10 @@ *> \verbatim *> JOB is CHARACTER*1 *> Specifies the type of backward transformation required: -*> = 'N', do nothing, return immediately; -*> = 'P', do backward transformation for permutation only; -*> = 'S', do backward transformation for scaling only; -*> = 'B', do backward transformations for both permutation and +*> = 'N': do nothing, return immediately; +*> = 'P': do backward transformation for permutation only; +*> = 'S': do backward transformation for scaling only; +*> = 'B': do backward transformations for both permutation and *> scaling. *> JOB must be the same as the argument JOB supplied to ZGEBAL. *> \endverbatim diff --git a/lapack-netlib/SRC/zgeev.f b/lapack-netlib/SRC/zgeev.f index 22b04469f..1ba542587 100644 --- a/lapack-netlib/SRC/zgeev.f +++ b/lapack-netlib/SRC/zgeev.f @@ -157,7 +157,7 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, the QR algorithm failed to compute all the *> eigenvalues, and no eigenvectors have been computed; -*> elements and i+1:N of W contain eigenvalues which have +*> elements i+1:N of W contain eigenvalues which have *> converged. *> \endverbatim * diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index d553da90b..91a20416e 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -80,13 +80,13 @@ *> desirable, then this option is advisable. The input matrix A *> is preprocessed with QR factorization with FULL (row and *> column) pivoting. -*> = 'G' Computation as with 'F' with an additional estimate of the +*> = 'G': Computation as with 'F' with an additional estimate of the *> condition number of B, where A=B*D. If A has heavily weighted *> rows, then using this condition number gives too pessimistic *> error bound. *> = 'A': Small singular values are not well determined by the data *> and are considered as noisy; the matrix is treated as -*> numerically rank defficient. The error in the computed +*> numerically rank deficient. The error in the computed *> singular values is bounded by f(m,n)*epsilon*||A||. 
*> The computed SVD A = U * S * V^* restores A up to *> f(m,n)*epsilon*||A||. @@ -117,7 +117,7 @@ *> = 'V': N columns of V are returned in the array V; Jacobi rotations *> are not explicitly accumulated. *> = 'J': N columns of V are returned in the array V, but they are -*> computed as the product of Jacobi rotations, if JOBT .EQ. 'N'. +*> computed as the product of Jacobi rotations, if JOBT = 'N'. *> = 'W': V may be used as workspace of length N*N. See the description *> of V. *> = 'N': V is not computed. @@ -131,7 +131,7 @@ *> specified range. If A .NE. 0 is scaled so that the largest singular *> value of c*A is around SQRT(BIG), BIG=DLAMCH('O'), then JOBR issues *> the licence to kill columns of A whose norm in c*A is less than -*> SQRT(SFMIN) (for JOBR.EQ.'R'), or less than SMALL=SFMIN/EPSLN, +*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, *> where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E'). *> = 'N': Do not kill small columns of c*A. This option assumes that *> BLAS and QR factorizations and triangular solvers are @@ -229,7 +229,7 @@ *> If JOBU = 'F', then U contains on exit the M-by-M matrix of *> the left singular vectors, including an ONB *> of the orthogonal complement of the Range(A). -*> If JOBU = 'W' .AND. (JOBV.EQ.'V' .AND. JOBT.EQ.'T' .AND. M.EQ.N), +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), *> then U is used as workspace if the procedure *> replaces A with A^*. In that case, [V] is computed *> in U as left singular vectors of A^* and then @@ -251,7 +251,7 @@ *> V is COMPLEX*16 array, dimension ( LDV, N ) *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; -*> If JOBV = 'W', AND (JOBU.EQ.'U' AND JOBT.EQ.'T' AND M.EQ.N), +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), *> then V is used as workspace if the pprocedure *> replaces A with A^*. In that case, [U] is computed *> in V as right singular vectors of A^* and then @@ -282,7 +282,7 @@ *> Length of CWORK to confirm proper allocation of workspace. *> LWORK depends on the job: *> -*> 1. If only SIGMA is needed ( JOBU.EQ.'N', JOBV.EQ.'N' ) and +*> 1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and *> 1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'): *> LWORK >= 2*N+1. This is the minimal requirement. *> ->> For optimal performance (blocked code) the optimal value @@ -298,9 +298,9 @@ *> In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ), *> N*N+LWORK(ZPOCON)). -*> 2. If SIGMA and the right singular vectors are needed (JOBV.EQ.'V'), -*> (JOBU.EQ.'N') -*> 2.1 .. no scaled condition estimate requested (JOBE.EQ.'N'): +*> 2. If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> (JOBU = 'N') +*> 2.1 .. no scaled condition estimate requested (JOBE = 'N'): *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance, *> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, @@ -318,10 +318,10 @@ *> LWORK >= max(N+LWORK(ZGEQP3), LWORK(ZPOCON), N+LWORK(ZGESVJ), *> N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)). *> 3. If SIGMA and the left singular vectors are needed -*> 3.1 .. no scaled condition estimate requested (JOBE.EQ.'N'): +*> 3.1 .. no scaled condition estimate requested (JOBE = 'N'): *> -> the minimal requirement is LWORK >= 3*N. 
*> -> For optimal performance: -*> if JOBU.EQ.'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, *> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR. *> In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(ZGEQP3), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). @@ -329,15 +329,15 @@ *> required (JOBA='E', or 'G'). *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance: -*> if JOBU.EQ.'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, *> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR. *> In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZPOCON), *> 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). -*> 4. If the full SVD is needed: (JOBU.EQ.'U' or JOBU.EQ.'F') and -*> 4.1. if JOBV.EQ.'V' +*> 4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> 4.1. if JOBV = 'V' *> the minimal requirement is LWORK >= 5*N+2*N*N. -*> 4.2. if JOBV.EQ.'J' the minimal requirement is +*> 4.2. if JOBV = 'J' the minimal requirement is *> LWORK >= 4*N+N*N. *> In both cases, the allocated CWORK can accommodate blocked runs *> of ZGEQP3, ZGEQRF, ZGELQF, SUNMQR, ZUNMLQ. @@ -356,7 +356,7 @@ *> of A. (See the description of SVA().) *> RWORK(2) = See the description of RWORK(1). *> RWORK(3) = SCONDA is an estimate for the condition number of -*> column equilibrated A. (If JOBA .EQ. 'E' or 'G') +*> column equilibrated A. (If JOBA = 'E' or 'G') *> SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). *> It is computed using SPOCON. It holds *> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA @@ -375,7 +375,7 @@ *> triangular factor in the first QR factorization. *> RWORK(5) = an estimate of the scaled condition number of the *> triangular factor in the second QR factorization. -*> The following two parameters are computed if JOBT .EQ. 'T'. +*> The following two parameters are computed if JOBT = 'T'. *> They are provided for a developer/implementer who is familiar *> with the details of the method. *> RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy @@ -456,23 +456,23 @@ *> of JOBA and JOBR. *> IWORK(2) = the number of the computed nonzero singular values *> IWORK(3) = if nonzero, a warning message: -*> If IWORK(3).EQ.1 then some of the column norms of A +*> If IWORK(3) = 1 then some of the column norms of A *> were denormalized floats. The requested high accuracy *> is not warranted by the data. -*> IWORK(4) = 1 or -1. If IWORK(4) .EQ. 1, then the procedure used A^* to +*> IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to *> do the job as specified by the JOB parameters. -*> If the call to ZGEJSV is a workspace query (indicated by LWORK .EQ. -1 or -*> LRWORK .EQ. -1), then on exit IWORK(1) contains the required length of +*> If the call to ZGEJSV is a workspace query (indicated by LWORK = -1 or +*> LRWORK = -1), then on exit IWORK(1) contains the required length of *> IWORK for the job parameters used in the call. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> < 0 : if INFO = -i, then the i-th argument had an illegal value. -*> = 0 : successful exit; -*> > 0 : ZGEJSV did not converge in the maximal allowed number -*> of sweeps. The computed values may be inaccurate. +*> < 0: if INFO = -i, then the i-th argument had an illegal value. 
+*> = 0: successful exit; +*> > 0: ZGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. *> \endverbatim * * Authors: @@ -1338,7 +1338,7 @@ IF ( L2ABER ) THEN * Standard absolute error bound suffices. All sigma_i with * sigma_i < N*EPSLN*||A|| are flushed to zero. This is an -* agressive enforcement of lower numerical rank by introducing a +* aggressive enforcement of lower numerical rank by introducing a * backward error of the order of N*EPSLN*||A||. TEMP1 = SQRT(DBLE(N))*EPSLN DO 3001 p = 2, N @@ -1350,7 +1350,7 @@ 3001 CONTINUE 3002 CONTINUE ELSE IF ( L2RANK ) THEN -* .. similarly as above, only slightly more gentle (less agressive). +* .. similarly as above, only slightly more gentle (less aggressive). * Sudden drop on the diagonal of R1 is used as the criterion for * close-to-rank-deficient. TEMP1 = SQRT(SFMIN) @@ -1720,7 +1720,7 @@ CALL ZPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1, $ CWORK(2*N+NR*NR+1),RWORK,IERR) CONDR1 = ONE / SQRT(TEMP1) -* .. here need a second oppinion on the condition number +* .. here need a second opinion on the condition number * .. then assume worst case scenario * R1 is OK for inverse <=> CONDR1 .LT. DBLE(N) * more conservative <=> CONDR1 .LT. SQRT(DBLE(N)) @@ -1765,7 +1765,7 @@ ELSE * * .. ill-conditioned case: second QRF with pivoting -* Note that windowed pivoting would be equaly good +* Note that windowed pivoting would be equally good * numerically, and more run-time efficient. So, in * an optimal implementation, the next call to ZGEQP3 * should be replaced with eg. CALL ZGEQPX (ACM TOMS #782) @@ -1823,7 +1823,7 @@ * IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 -* (this overwrittes the copy of R2, as it will not be +* (this overwrites the copy of R2, as it will not be * needed in this branch, but it does not overwritte the * Huseholder vectors of Q2.). CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) @@ -2079,7 +2079,7 @@ * * This branch deploys a preconditioned Jacobi SVD with explicitly * accumulated rotations. It is included as optional, mainly for -* experimental purposes. It does perfom well, and can also be used. +* experimental purposes. It does perform well, and can also be used. * In this implementation, this branch will be automatically activated * if the condition number sigma_max(A) / sigma_min(A) is predicted * to be greater than the overflow threshold. This is because the diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index 656396536..4e7e7e38e 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -1,3 +1,4 @@ +*> \brief \b ZGELQ * * Definition: * =========== @@ -17,7 +18,17 @@ * ============= *> *> \verbatim -*> ZGELQ computes a LQ factorization of an M-by-N matrix A. +*> +*> ZGELQ computes an LQ factorization of a complex M-by-N matrix A: +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a N-by-N orthogonal matrix; +*> L is an lower-triangular M-by-M matrix; +*> 0 is a M-by-(N-M) zero matrix, if M < N. +*> *> \endverbatim * * Arguments: @@ -138,7 +149,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. 
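The L2ABER branch patched above applies the absolute-error criterion spelled out in its comment: singular values below roughly SQRT(N)*EPSLN*||A|| carry no information at this accuracy level and are flushed to zero. As an illustration only (not part of the patch), the same criterion as a small C helper, assuming s[] holds the singular values sorted in decreasing order so that s[0] approximates ||A||_2:

    #include <float.h>
    #include <math.h>

    /* Number of singular values kept under the aggressive absolute-error
       rule: s[i] survives only if it exceeds sqrt(n) * eps * s[0]. */
    int numerical_rank(const double *s, int n)
    {
        if (n <= 0)
            return 0;
        double thresh = sqrt((double)n) * DBL_EPSILON * s[0];
        int rank = 0;
        while (rank < n && s[rank] > thresh)
            ++rank;
        return rank;
    }

The L2RANK variant mentioned in the same hunk is gentler: instead of one global threshold it looks for a sudden drop between consecutive diagonal entries of R1.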
*> @@ -159,10 +170,10 @@ SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, TSIZE, LWORK diff --git a/lapack-netlib/SRC/zgelq2.f b/lapack-netlib/SRC/zgelq2.f index 188c8f8c8..a825ac17b 100644 --- a/lapack-netlib/SRC/zgelq2.f +++ b/lapack-netlib/SRC/zgelq2.f @@ -33,8 +33,16 @@ *> *> \verbatim *> -*> ZGELQ2 computes an LQ factorization of a complex m by n matrix A: -*> A = L * Q. +*> ZGELQ2 computes an LQ factorization of a complex m-by-n matrix A: +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a n-by-n orthogonal matrix; +*> L is an lower-triangular m-by-m matrix; +*> 0 is a m-by-(n-m) zero matrix, if m < n. +*> *> \endverbatim * * Arguments: @@ -96,7 +104,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complex16GEcomputational * @@ -121,10 +129,10 @@ * ===================================================================== SUBROUTINE ZGELQ2( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f index 8d9341a61..3a5e5fd4a 100644 --- a/lapack-netlib/SRC/zgelqf.f +++ b/lapack-netlib/SRC/zgelqf.f @@ -34,7 +34,15 @@ *> \verbatim *> *> ZGELQF computes an LQ factorization of a complex M-by-N matrix A: -*> A = L * Q. +*> +*> A = ( L 0 ) * Q +*> +*> where: +*> +*> Q is a N-by-N orthogonal matrix; +*> L is an lower-triangular M-by-M matrix; +*> 0 is a M-by-(N-M) zero matrix, if M < N. +*> *> \endverbatim * * Arguments: @@ -110,7 +118,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complex16GEcomputational * @@ -135,10 +143,10 @@ * ===================================================================== SUBROUTINE ZGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f index aa07e0feb..6fb2be3d8 100644 --- a/lapack-netlib/SRC/zgemlq.f +++ b/lapack-netlib/SRC/zgemlq.f @@ -1,3 +1,4 @@ +*> \brief \b ZGEMLQ * * Definition: * =========== @@ -142,7 +143,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. 
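ZGELQ (and ZGEMLQ above) follow the usual LAPACK convention that a negative TSIZE/LWORK turns the call into a workspace query: the routine only reports the required sizes in T(1) and WORK(1), as the LIWORK/LCWORK/LRWORK = -1 descriptions for ZGESVDQ further below also spell out. As an illustration only (not part of the patch), a C sketch of the resulting two-call idiom; the trailing-underscore Fortran symbol name, the 32-bit INTEGER, and double _Complex for COMPLEX*16 are platform assumptions, and lq_factor is a hypothetical helper:

    #include <complex.h>
    #include <stdlib.h>

    /* Fortran binding for ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, INFO ). */
    extern void zgelq_(const int *m, const int *n, double _Complex *a,
                       const int *lda, double _Complex *t, const int *tsize,
                       double _Complex *work, const int *lwork, int *info);

    int lq_factor(int m, int n, double _Complex *a, int lda)
    {
        int info, query = -1;
        double _Complex tq, wq;
        /* First call: a workspace query, nothing is factorized. */
        zgelq_(&m, &n, a, &lda, &tq, &query, &wq, &query, &info);
        if (info != 0)
            return info;
        int tsize = (int)creal(tq);
        int lwork = (int)creal(wq);
        double _Complex *t = malloc((size_t)tsize * sizeof *t);
        double _Complex *work = malloc((size_t)lwork * sizeof *work);
        if (t != NULL && work != NULL)
            /* Second call: the actual factorization with sized workspace. */
            zgelq_(&m, &n, a, &lda, t, &tsize, work, &lwork, &info);
        else
            info = -1000;   /* allocation-failure sentinel, illustration only */
        free(t);
        free(work);
        return info;
    }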
*> diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f index 32f1bf4d5..aec9321bb 100644 --- a/lapack-netlib/SRC/zgemqr.f +++ b/lapack-netlib/SRC/zgemqr.f @@ -1,3 +1,4 @@ +*> \brief \b ZGEMQR * * Definition: * =========== @@ -144,7 +145,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f index 1aa457f56..cea686b98 100644 --- a/lapack-netlib/SRC/zgeqr.f +++ b/lapack-netlib/SRC/zgeqr.f @@ -1,3 +1,4 @@ +*> \brief \b ZGEQR * * Definition: * =========== @@ -17,7 +18,18 @@ * ============= *> *> \verbatim -*> ZGEQR computes a QR factorization of an M-by-N matrix A. +*> +*> ZGEQR computes a QR factorization of a complex M-by-N matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -138,7 +150,7 @@ *> \verbatim *> *> These details are particular for this LAPACK implementation. Users should not -*> take them for granted. These details may change in the future, and are unlikely not +*> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. *> @@ -160,10 +172,10 @@ SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, TSIZE, LWORK diff --git a/lapack-netlib/SRC/zgeqr2.f b/lapack-netlib/SRC/zgeqr2.f index d2774d788..0384c1d42 100644 --- a/lapack-netlib/SRC/zgeqr2.f +++ b/lapack-netlib/SRC/zgeqr2.f @@ -33,8 +33,17 @@ *> *> \verbatim *> -*> ZGEQR2 computes a QR factorization of a complex m by n matrix A: -*> A = Q * R. +*> ZGEQR2 computes a QR factorization of a complex m-by-n matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a m-by-m orthogonal matrix; +*> R is an upper-triangular n-by-n matrix; +*> 0 is a (m-n)-by-n zero matrix, if m > n. +*> *> \endverbatim * * Arguments: @@ -96,7 +105,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complex16GEcomputational * @@ -121,10 +130,10 @@ * ===================================================================== SUBROUTINE ZGEQR2( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. 
INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/zgeqr2p.f index 0e5e55486..7bbd81da9 100644 --- a/lapack-netlib/SRC/zgeqr2p.f +++ b/lapack-netlib/SRC/zgeqr2p.f @@ -33,8 +33,18 @@ *> *> \verbatim *> -*> ZGEQR2P computes a QR factorization of a complex m by n matrix A: -*> A = Q * R. The diagonal entries of R are real and nonnegative. +*> ZGEQR2P computes a QR factorization of a complex m-by-n matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a m-by-m orthogonal matrix; +*> R is an upper-triangular n-by-n matrix with nonnegative diagonal +*> entries; +*> 0 is a (m-n)-by-n zero matrix, if m > n. +*> *> \endverbatim * * Arguments: @@ -97,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complex16GEcomputational * @@ -124,10 +134,10 @@ * ===================================================================== SUBROUTINE ZGEQR2P( M, N, A, LDA, TAU, WORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/lapack-netlib/SRC/zgeqrf.f index 3ea1e71e1..2c03ebe73 100644 --- a/lapack-netlib/SRC/zgeqrf.f +++ b/lapack-netlib/SRC/zgeqrf.f @@ -34,7 +34,16 @@ *> \verbatim *> *> ZGEQRF computes a QR factorization of a complex M-by-N matrix A: -*> A = Q * R. +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -111,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 +*> \date November 2019 * *> \ingroup complex16GEcomputational * @@ -136,10 +145,10 @@ * ===================================================================== SUBROUTINE ZGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/zgeqrfp.f index cdc4bfa94..80ead21ca 100644 --- a/lapack-netlib/SRC/zgeqrfp.f +++ b/lapack-netlib/SRC/zgeqrfp.f @@ -33,8 +33,18 @@ *> *> \verbatim *> -*> ZGEQRFP computes a QR factorization of a complex M-by-N matrix A: -*> A = Q * R. The diagonal entries of R are real and nonnegative. +*> ZGEQRFP computes a QR factorization of a complex M-by-N matrix A: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is a M-by-M orthogonal matrix; +*> R is an upper-triangular N-by-N matrix with nonnegative diagonal +*> entries; +*> 0 is a (M-N)-by-N zero matrix, if M > N. +*> *> \endverbatim * * Arguments: @@ -112,7 +122,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd.
* -*> \date December 2016 +*> \date November 2019 * *> \ingroup complex16GEcomputational * @@ -139,10 +149,10 @@ * ===================================================================== SUBROUTINE ZGEQRFP( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, LWORK, M, N diff --git a/lapack-netlib/SRC/zgerfsx.f b/lapack-netlib/SRC/zgerfsx.f index 5aabe50ed..3af7f8b6b 100644 --- a/lapack-netlib/SRC/zgerfsx.f +++ b/lapack-netlib/SRC/zgerfsx.f @@ -74,7 +74,7 @@ *> Specifies the form of the system of equations: *> = 'N': A * X = B (No transpose) *> = 'T': A**T * X = B (Transpose) -*> = 'C': A**H * X = B (Conjugate transpose = Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) *> \endverbatim *> *> \param[in] EQUED @@ -283,7 +283,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -319,14 +319,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -334,9 +334,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/zgesc2.f b/lapack-netlib/SRC/zgesc2.f index 72ef99dba..cdf15e4f4 100644 --- a/lapack-netlib/SRC/zgesc2.f +++ b/lapack-netlib/SRC/zgesc2.f @@ -91,7 +91,7 @@ *> \verbatim *> SCALE is DOUBLE PRECISION *> On exit, SCALE contains the scale factor. SCALE is chosen -*> 0 <= SCALE <= 1 to prevent owerflow in the solution. +*> 0 <= SCALE <= 1 to prevent overflow in the solution. 
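The SCALE output of ZGESC2 documented above is the standard LAPACK overflow guard: whenever a division by a small pivot could overflow, the right-hand side is first scaled down and the shrink factor is folded into SCALE, so the computed x satisfies A * x = SCALE * b with 0 <= SCALE <= 1. As an illustration only (not part of the patch), the core of that guard in real arithmetic with a simplified overflow threshold; guarded_div is a hypothetical helper and assumes d is nonzero:

    #include <float.h>
    #include <math.h>

    /* Divide b by d, rescaling b first if b/d would overflow; the shrink
       factor is accumulated into *scale, which stays in (0, 1]. */
    double guarded_div(double b, double d, double *scale)
    {
        double bignum = 1.0 / DBL_MIN;        /* simplified overflow bound */
        if (fabs(b) > fabs(d) * bignum) {     /* |b/d| would exceed bignum */
            double s = (fabs(d) * bignum) / fabs(b);
            b *= s;                           /* now |b/d| equals bignum */
            *scale *= s;
        }
        return b / d;
    }

In the real routine every component of the solution is guarded this way, and the whole of B is rescaled whenever the factor shrinks, which is how a single SCALE ends up bounding the entire solve.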
*> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zgesvdq.f new file mode 100644 index 000000000..e0fb920bb --- /dev/null +++ b/lapack-netlib/SRC/zgesvdq.f @@ -0,0 +1,1389 @@ +*> \brief ZGESVDQ computes the singular value decomposition (SVD) with a QR-Preconditioned QR SVD Method for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGESVDQ + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA, +* S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK, +* CWORK, LCWORK, RWORK, LRWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* CHARACTER JOBA, JOBP, JOBR, JOBU, JOBV +* INTEGER M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LCWORK, LRWORK, +* INFO +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( * ) +* DOUBLE PRECISION S( * ), RWORK( * ) +* INTEGER IWORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGESVDQ computes the singular value decomposition (SVD) of a complex +*> M-by-N matrix A, where M >= N. The SVD of A is written as +*> [++] [xx] [x0] [xx] +*> A = U * SIGMA * V^*, [++] = [xx] * [ox] * [xx] +*> [++] [xx] +*> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal +*> matrix, and V is an N-by-N unitary matrix. The diagonal elements +*> of SIGMA are the singular values of A. The columns of U and V are the +*> left and the right singular vectors of A, respectively. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy in the computed SVD +*> = 'A' The requested accuracy corresponds to having the backward +*> error bounded by || delta A ||_F <= f(m,n) * EPS * || A ||_F, +*> where EPS = DLAMCH('Epsilon'). This authorises ZGESVDQ to +*> truncate the computed triangular factor in a rank revealing +*> QR factorization whenever the truncated part is below the +*> threshold of the order of EPS * ||A||_F. This is an aggressive +*> truncation level. +*> = 'M' Similarly as with 'A', but the truncation is more gentle: it +*> is allowed only when there is a drop on the diagonal of the +*> triangular factor in the QR factorization. This is a medium +*> truncation level. +*> = 'H' High accuracy requested. No numerical rank determination based +*> on the rank revealing QR factorization is attempted. +*> = 'E' Same as 'H', and in addition the condition number of column +*> scaled A is estimated and returned in RWORK(1). +*> N^(-1/4)*RWORK(1) <= ||pinv(A_scaled)||_2 <= N^(1/4)*RWORK(1) +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> = 'P' The rows of A are ordered in decreasing order with respect to +*> ||A(i,:)||_\infty. This enhances numerical accuracy at the cost +*> of extra data movement. Recommended for numerical robustness. +*> = 'N' No row pivoting. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> = 'T' After the initial pivoted QR factorization, ZGESVD is applied to +*> the adjoint R**H of the computed triangular factor R. This involves +*> some extra data movement (matrix transpositions). Useful for +*> experiments, research and development. +*> = 'N' The triangular factor R is given as input to ZGESVD. This may be +*> preferred as it involves less data movement.
+*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> = 'A' All M left singular vectors are computed and returned in the +*> matrix U. See the description of U. +*> = 'S' or 'U' N = min(M,N) left singular vectors are computed and returned +*> in the matrix U. See the description of U. +*> = 'R' Numerical rank NUMRANK is determined and only NUMRANK left singular +*> vectors are computed and returned in the matrix U. +*> = 'F' The N left singular vectors are returned in factored form as the +*> product of the Q factor from the initial QR factorization and the +*> N left singular vectors of (R**H , 0)**H. If row pivoting is used, +*> then the necessary information on the row pivoting is stored in +*> IWORK(N+1:N+M-1). +*> = 'N' The left singular vectors are not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> = 'A', 'V' All N right singular vectors are computed and returned in +*> the matrix V. +*> = 'R' Numerical rank NUMRANK is determined and only NUMRANK right singular +*> vectors are computed and returned in the matrix V. This option is +*> allowed only if JOBU = 'R' or JOBU = 'N'; otherwise it is illegal. +*> = 'N' The right singular vectors are not computed. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array of dimensions LDA x N +*> On entry, the input matrix A. +*> On exit, if JOBU .NE. 'N' or JOBV .NE. 'N', the lower triangle of A contains +*> the Householder vectors as stored by ZGEQP3. If JOBU = 'F', these Householder +*> vectors together with CWORK(1:N) can be used to restore the Q factors from +*> the initial pivoted QR factorization of A. See the description of U. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER. +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array of dimension N. +*> The singular values of A, ordered so that S(i) >= S(i+1). +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is COMPLEX*16 array, dimension +*> LDU x M if JOBU = 'A'; see the description of LDU. In this case, +*> on exit, U contains the M left singular vectors. +*> LDU x N if JOBU = 'S', 'U', 'R' ; see the description of LDU. In this +*> case, U contains the leading N or the leading NUMRANK left singular vectors. +*> LDU x N if JOBU = 'F' ; see the description of LDU. In this case U +*> contains N x N unitary matrix that can be used to form the left +*> singular vectors. +*> If JOBU = 'N', U is not referenced. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER. +*> The leading dimension of the array U. +*> If JOBU = 'A', 'S', 'U', 'R', LDU >= max(1,M). +*> If JOBU = 'F', LDU >= max(1,N). +*> Otherwise, LDU >= 1. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is COMPLEX*16 array, dimension +*> LDV x N if JOBV = 'A', 'V', 'R' or if JOBA = 'E' . +*> If JOBV = 'A', or 'V', V contains the N-by-N unitary matrix V**H; +*> If JOBV = 'R', V contains the first NUMRANK rows of V**H (the right +*> singular vectors, stored rowwise, of the NUMRANK largest singular values). +*> If JOBV = 'N' and JOBA = 'E', V is used as a workspace. +*> If JOBV = 'N', and JOBA.NE.'E', V is not referenced. 
+*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If JOBV = 'A', 'V', 'R', or JOBA = 'E', LDV >= max(1,N). +*> Otherwise, LDV >= 1. +*> \endverbatim +*> +*> \param[out] NUMRANK +*> \verbatim +*> NUMRANK is INTEGER +*> NUMRANK is the numerical rank first determined after the rank +*> revealing QR factorization, following the strategy specified by the +*> value of JOBA. If JOBV = 'R' and JOBU = 'R', only NUMRANK +*> leading singular values and vectors are then requested in the call +*> of ZGESVD. The final value of NUMRANK might be further reduced if +*> some singular values are computed as zeros. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (max(1, LIWORK)). +*> On exit, IWORK(1:N) contains column pivoting permutation of the +*> rank revealing QR factorization. +*> If JOBP = 'P', IWORK(N+1:N+M-1) contains the indices of the sequence +*> of row swaps used in row pivoting. These can be used to restore the +*> left singular vectors in the case JOBU = 'F'. +* +*> If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> IWORK(1) returns the minimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> LIWORK is INTEGER +*> The dimension of the array IWORK. +*> LIWORK >= N + M - 1, if JOBP = 'P'; +*> LIWORK >= N if JOBP = 'N'. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the CWORK, IWORK, and RWORK arrays, and no error +*> message related to LCWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] CWORK +*> \verbatim +*> CWORK is COMPLEX*16 array, dimension (max(2, LCWORK)), used as a workspace. +*> On exit, if, on entry, LCWORK.NE.-1, CWORK(1:N) contains parameters +*> needed to recover the Q factor from the QR factorization computed by +*> ZGEQP3. +* +*> If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> CWORK(1) returns the optimal LCWORK, and +*> CWORK(2) returns the minimal LCWORK. +*> \endverbatim +*> +*> \param[in,out] LCWORK +*> \verbatim +*> LCWORK is INTEGER +*> The dimension of the array CWORK.
It is determined as follows: +*> Let LWQP3 = N+1, LWCON = 2*N, and let +*> LWUNQ = { MAX( N, 1 ), if JOBU = 'R', 'S', or 'U' +*> { MAX( M, 1 ), if JOBU = 'A' +*> LWSVD = MAX( 3*N, 1 ) +*> LWLQF = MAX( N/2, 1 ), LWSVD2 = MAX( 3*(N/2), 1 ), LWUNLQ = MAX( N, 1 ), +*> LWQRF = MAX( N/2, 1 ), LWUNQ2 = MAX( N, 1 ) +*> Then the minimal value of LCWORK is: +*> = MAX( N + LWQP3, LWSVD ) if only the singular values are needed; +*> = MAX( N + LWQP3, LWCON, LWSVD ) if only the singular values are needed, +*> and a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD, LWUNQ ) if the singular values and the left +*> singular vectors are requested; +*> = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) if the singular values and the left +*> singular vectors are requested, and also +*> a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD ) if the singular values and the right +*> singular vectors are requested; +*> = N + MAX( LWQP3, LWCON, LWSVD ) if the singular values and the right +*> singular vectors are requested, and also +*> a scaled condition estimate requested; +*> +*> = N + MAX( LWQP3, LWSVD, LWUNQ ) if the full SVD is requested with JOBV = 'R'; +*> independent of JOBR; +*> = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) if the full SVD is requested, +*> JOBV = 'R' and, also a scaled condition +*> estimate requested; independent of JOBR; +*> = MAX( N + MAX( LWQP3, LWSVD, LWUNQ ), +*> N + MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, N/2+LWUNLQ, LWUNQ) ) if the +*> full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='N' +*> = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ), +*> N + MAX( LWQP3, LWCON, N/2+LWLQF, N/2+LWSVD2, N/2+LWUNLQ, LWUNQ ) ) +*> if the full SVD is requested with JOBV = 'A' or 'V', and +*> JOBR ='N', and also a scaled condition number estimate +*> requested. +*> = MAX( N + MAX( LWQP3, LWSVD, LWUNQ ), +*> N + MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, N/2+LWUNQ2, LWUNQ ) ) if the +*> full SVD is requested with JOBV = 'A', 'V', and JOBR ='T' +*> = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ), +*> N + MAX( LWQP3, LWCON, N/2+LWQRF, N/2+LWSVD2, N/2+LWUNQ2, LWUNQ ) ) +*> if the full SVD is requested with JOBV = 'A', 'V' and +*> JOBR ='T', and also a scaled condition number estimate +*> requested. +*> Finally, LCWORK must be at least two: LCWORK = MAX( 2, LCWORK ). +*> +*> If LCWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the CWORK, IWORK, and RWORK arrays, and no error +*> message related to LCWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (max(1, LRWORK)). +*> On exit, +*> 1. If JOBA = 'E', RWORK(1) contains an estimate of the condition +*> number of column scaled A. If A = C * D where D is diagonal and C +*> has unit columns in the Euclidean norm, then, assuming full column rank, +*> N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1). +*> Otherwise, RWORK(1) = -1. +*> 2. RWORK(2) contains the number of singular values computed as +*> exact zeros in ZGESVD applied to the upper triangular or trapezoidal +*> R (from the initial QR factorization). In case of early exit (no call to +*> ZGESVD, such as in the case of zero matrix) RWORK(2) = -1. +* +*> If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0, +*> RWORK(1) returns the minimal LRWORK. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER. +*> The dimension of the array RWORK.
+*> If JOBP ='P', then LRWORK >= MAX(2, M, 5*N); +*> Otherwise, LRWORK >= MAX(2, 5*N). +* +*> If LRWORK = -1, then a workspace query is assumed; the routine +*> only calculates and returns the optimal and minimal sizes +*> for the CWORK, IWORK, and RWORK arrays, and no error +*> message related to LCWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> > 0: if ZBDSQR did not converge, INFO specifies how many superdiagonals +*> of an intermediate bidiagonal form B (computed in ZGESVD) did not +*> converge to zero. +*> \endverbatim +* +*> \par Further Details: +* ======================== +*> +*> \verbatim +*> +*> 1. The data movement (matrix transpose) is coded using simple nested +*> DO-loops because BLAS and LAPACK do not provide corresponding subroutines. +*> Those DO-loops are easily identified in this source code - by the CONTINUE +*> statements labeled with 11**. In an optimized version of this code, the +*> nested DO loops should be replaced with calls to an optimized subroutine. +*> 2. This code scales A by 1/SQRT(M) if the largest ABS(A(i,j)) could cause +*> column norm overflow. This is the minimal precaution and it is left to the +*> SVD routine (ZGESVD) to do its own preemptive scaling if potential over- +*> or underflows are detected. To avoid repeated scanning of the array A, +*> an optimal implementation would do all necessary scaling before calling +*> ZGESVD and the scaling in ZGESVD can be switched off. +*> 3. Other comments related to code optimization are given in comments in the +*> code, enclosed in [[double brackets]]. +*> \endverbatim +* +*> \par Bugs, examples and comments +* =========================== +* +*> \verbatim +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> \endverbatim +* +*> \par References +* =============== +* +*> \verbatim +*> [1] Zlatko Drmac, Algorithm 977: A QR-Preconditioned QR SVD Method for +*> Computing the SVD with High Accuracy. ACM Trans. Math. Softw. +*> 44(1): 11:1-11:30 (2017) +*> +*> SIGMA library, xGESVDQ section updated February 2016. +*> Developed and coded by Zlatko Drmac, Department of Mathematics +*> University of Zagreb, Croatia, drmac@math.hr +*> \endverbatim +* +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> Developed and coded by Zlatko Drmac, Department of Mathematics +*> University of Zagreb, Croatia, drmac@math.hr +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2018 +* +*> \ingroup complex16GEsing +* +* ===================================================================== + SUBROUTINE ZGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA, + $ S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK, + $ CWORK, LCWORK, RWORK, LRWORK, INFO ) +* .. Scalar Arguments .. + IMPLICIT NONE + CHARACTER JOBA, JOBP, JOBR, JOBU, JOBV + INTEGER M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LCWORK, LRWORK, + $ INFO +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( * ) + DOUBLE PRECISION S( * ), RWORK( * ) + INTEGER IWORK( * ) +* +* ===================================================================== +* +* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = (0.0D0,0.0D0), CONE = (1.0D0,0.0D0) ) +* .. +* .. Local Scalars .. + INTEGER IERR, NR, N1, OPTRATIO, p, q + INTEGER LWCON, LWQP3, LWRK_ZGELQF, LWRK_ZGESVD, LWRK_ZGESVD2, + $ LWRK_ZGEQP3, LWRK_ZGEQRF, LWRK_ZUNMLQ, LWRK_ZUNMQR, + $ LWRK_ZUNMQR2, LWLQF, LWQRF, LWSVD, LWSVD2, LWUNQ, + $ LWUNQ2, LWUNLQ, MINWRK, MINWRK2, OPTWRK, OPTWRK2, + $ IMINWRK, RMINWRK + LOGICAL ACCLA, ACCLM, ACCLH, ASCALED, CONDA, DNTWU, DNTWV, + $ LQUERY, LSVC0, LSVEC, ROWPRM, RSVEC, RTRANS, WNTUA, + $ WNTUF, WNTUR, WNTUS, WNTVA, WNTVR + DOUBLE PRECISION BIG, EPSLN, RTMP, SCONDA, SFMIN + COMPLEX*16 CTMP +* .. +* .. Local Arrays + COMPLEX*16 CDUMMY(1) + DOUBLE PRECISION RDUMMY(1) +* .. +* .. External Subroutines (BLAS, LAPACK) + EXTERNAL ZGELQF, ZGEQP3, ZGEQRF, ZGESVD, ZLACPY, ZLAPMT, + $ ZLASCL, ZLASET, ZLASWP, ZDSCAL, DLASET, DLASCL, + $ ZPOCON, ZUNMLQ, ZUNMQR, XERBLA +* .. +* .. External Functions (BLAS, LAPACK) + LOGICAL LSAME + INTEGER IDAMAX + DOUBLE PRECISION ZLANGE, DZNRM2, DLAMCH + EXTERNAL LSAME, ZLANGE, IDAMAX, DZNRM2, DLAMCH +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX, MIN, DBLE, SQRT +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + WNTUS = LSAME( JOBU, 'S' ) .OR. LSAME( JOBU, 'U' ) + WNTUR = LSAME( JOBU, 'R' ) + WNTUA = LSAME( JOBU, 'A' ) + WNTUF = LSAME( JOBU, 'F' ) + LSVC0 = WNTUS .OR. WNTUR .OR. WNTUA + LSVEC = LSVC0 .OR. WNTUF + DNTWU = LSAME( JOBU, 'N' ) +* + WNTVR = LSAME( JOBV, 'R' ) + WNTVA = LSAME( JOBV, 'A' ) .OR. LSAME( JOBV, 'V' ) + RSVEC = WNTVR .OR. WNTVA + DNTWV = LSAME( JOBV, 'N' ) +* + ACCLA = LSAME( JOBA, 'A' ) + ACCLM = LSAME( JOBA, 'M' ) + CONDA = LSAME( JOBA, 'E' ) + ACCLH = LSAME( JOBA, 'H' ) .OR. CONDA +* + ROWPRM = LSAME( JOBP, 'P' ) + RTRANS = LSAME( JOBR, 'T' ) +* + IF ( ROWPRM ) THEN + IMINWRK = MAX( 1, N + M - 1 ) + RMINWRK = MAX( 2, M, 5*N ) + ELSE + IMINWRK = MAX( 1, N ) + RMINWRK = MAX( 2, 5*N ) + END IF + LQUERY = (LIWORK .EQ. -1 .OR. LCWORK .EQ. -1 .OR. LRWORK .EQ. -1) + INFO = 0 + IF ( .NOT. ( ACCLA .OR. ACCLM .OR. ACCLH ) ) THEN + INFO = -1 + ELSE IF ( .NOT.( ROWPRM .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = -2 + ELSE IF ( .NOT.( RTRANS .OR. LSAME( JOBR, 'N' ) ) ) THEN + INFO = -3 + ELSE IF ( .NOT.( LSVEC .OR. DNTWU ) ) THEN + INFO = -4 + ELSE IF ( WNTUR .AND. WNTVA ) THEN + INFO = -5 + ELSE IF ( .NOT.( RSVEC .OR. DNTWV )) THEN + INFO = -5 + ELSE IF ( M.LT.0 ) THEN + INFO = -6 + ELSE IF ( ( N.LT.0 ) .OR. ( N.GT.M ) ) THEN + INFO = -7 + ELSE IF ( LDA.LT.MAX( 1, M ) ) THEN + INFO = -9 + ELSE IF ( LDU.LT.1 .OR. ( LSVC0 .AND. LDU.LT.M ) .OR. + $ ( WNTUF .AND. LDU.LT.N ) ) THEN + INFO = -12 + ELSE IF ( LDV.LT.1 .OR. ( RSVEC .AND. LDV.LT.N ) .OR. + $ ( CONDA .AND. LDV.LT.N ) ) THEN + INFO = -14 + ELSE IF ( LIWORK .LT. IMINWRK .AND. .NOT. LQUERY ) THEN + INFO = -17 + END IF +* +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LCWORK are written with a lot of redundancy and +* can be simplified. However, this detailed form is easier for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for ZGEQP3 of an M x N matrix + LWQP3 = N+1 +* .. minimal workspace length for ZUNMQR to build left singular vectors + IF ( WNTUS .OR. WNTUR ) THEN + LWUNQ = MAX( N , 1 ) + ELSE IF ( WNTUA ) THEN + LWUNQ = MAX( M , 1 ) + END IF +* .. 
minimal workspace length for ZPOCON of an N x N matrix
+        LWCON = 2 * N
+*       .. ZGESVD of an N x N matrix
+        LWSVD = MAX( 3 * N, 1 )
+        IF ( LQUERY ) THEN
+            CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1,
+     $           RDUMMY, IERR )
+            LWRK_ZGEQP3 = INT( CDUMMY(1) )
+            IF ( WNTUS .OR. WNTUR ) THEN
+                CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $               LDU, CDUMMY, -1, IERR )
+                LWRK_ZUNMQR = INT( CDUMMY(1) )
+            ELSE IF ( WNTUA ) THEN
+                CALL ZUNMQR( 'L', 'N', M, M, N, A, LDA, CDUMMY, U,
+     $               LDU, CDUMMY, -1, IERR )
+                LWRK_ZUNMQR = INT( CDUMMY(1) )
+            ELSE
+                LWRK_ZUNMQR = 0
+            END IF
+        END IF
+        MINWRK = 2
+        OPTWRK = 2
+        IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN
+*           .. minimal and optimal sizes of the complex workspace if
+*           only the singular values are requested
+            IF ( CONDA ) THEN
+                MINWRK = MAX( N+LWQP3, LWCON, LWSVD )
+            ELSE
+                MINWRK = MAX( N+LWQP3, LWSVD )
+            END IF
+            IF ( LQUERY ) THEN
+                CALL ZGESVD( 'N', 'N', N, N, A, LDA, S, U, LDU,
+     $               V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                LWRK_ZGESVD = INT( CDUMMY(1) )
+                IF ( CONDA ) THEN
+                   OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, LWRK_ZGESVD )
+                ELSE
+                   OPTWRK = MAX( N+LWRK_ZGEQP3, LWRK_ZGESVD )
+                END IF
+            END IF
+        ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN
+*           .. minimal and optimal sizes of the complex workspace if the
+*           singular values and the left singular vectors are requested
+            IF ( CONDA ) THEN
+                MINWRK = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ )
+            ELSE
+                MINWRK = N + MAX( LWQP3, LWSVD, LWUNQ )
+            END IF
+            IF ( LQUERY ) THEN
+               IF ( RTRANS ) THEN
+                  CALL ZGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                 V, LDV, CDUMMY, -1, RDUMMY, IERR )
+               ELSE
+                  CALL ZGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                 V, LDV, CDUMMY, -1, RDUMMY, IERR )
+               END IF
+               LWRK_ZGESVD = INT( CDUMMY(1) )
+               IF ( CONDA ) THEN
+                   OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, LWRK_ZGESVD,
+     $                      LWRK_ZUNMQR )
+               ELSE
+                   OPTWRK = N + MAX( LWRK_ZGEQP3, LWRK_ZGESVD,
+     $                      LWRK_ZUNMQR )
+               END IF
+            END IF
+        ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*           .. minimal and optimal sizes of the complex workspace if the
+*           singular values and the right singular vectors are requested
+            IF ( CONDA ) THEN
+                MINWRK = N + MAX( LWQP3, LWCON, LWSVD )
+            ELSE
+                MINWRK = N + MAX( LWQP3, LWSVD )
+            END IF
+            IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                    CALL ZGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                   V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                ELSE
+                    CALL ZGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                   V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                END IF
+                LWRK_ZGESVD = INT( CDUMMY(1) )
+                IF ( CONDA ) THEN
+                    OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, LWRK_ZGESVD )
+                ELSE
+                    OPTWRK = N + MAX( LWRK_ZGEQP3, LWRK_ZGESVD )
+                END IF
+            END IF
+        ELSE
+*           .. minimal and optimal sizes of the complex workspace if the
+*           full SVD is requested
+            IF ( RTRANS ) THEN
+                MINWRK = MAX( LWQP3, LWSVD, LWUNQ )
+                IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                MINWRK = MINWRK + N
+                IF ( WNTVA ) THEN
+*                  .. minimal workspace length for N x N/2 ZGEQRF
+                   LWQRF  = MAX( N/2, 1 )
+*                  .. minimal workspace length for N/2 x N/2 ZGESVD
+                   LWSVD2 = MAX( 3 * (N/2), 1 )
+                   LWUNQ2 = MAX( N, 1 )
+                   MINWRK2 = MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2,
+     $                       N/2+LWUNQ2, LWUNQ )
+                   IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                   MINWRK2 = N + MINWRK2
+                   MINWRK = MAX( MINWRK, MINWRK2 )
+                END IF
+            ELSE
+                MINWRK = MAX( LWQP3, LWSVD, LWUNQ )
+                IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                MINWRK = MINWRK + N
+                IF ( WNTVA ) THEN
+*                   ..
minimal workspace length for N/2 x N ZGELQF
+                   LWLQF  = MAX( N/2, 1 )
+                   LWSVD2 = MAX( 3 * (N/2), 1 )
+                   LWUNLQ = MAX( N , 1 )
+                   MINWRK2 = MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2,
+     $                       N/2+LWUNLQ, LWUNQ )
+                   IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                   MINWRK2 = N + MINWRK2
+                   MINWRK = MAX( MINWRK, MINWRK2 )
+                END IF
+            END IF
+            IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL ZGESVD( 'O', 'A', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                   LWRK_ZGESVD = INT( CDUMMY(1) )
+                   OPTWRK = MAX(LWRK_ZGEQP3,LWRK_ZGESVD,LWRK_ZUNMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                      CALL ZGEQRF(N,N/2,U,LDU,CDUMMY,CDUMMY,-1,IERR)
+                      LWRK_ZGEQRF = INT( CDUMMY(1) )
+                      CALL ZGESVD( 'S', 'O', N/2,N/2, V,LDV, S, U,LDU,
+     $                     V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                      LWRK_ZGESVD2 = INT( CDUMMY(1) )
+                      CALL ZUNMQR( 'R', 'C', N, N, N/2, U, LDU, CDUMMY,
+     $                     V, LDV, CDUMMY, -1, IERR )
+                      LWRK_ZUNMQR2 = INT( CDUMMY(1) )
+                      OPTWRK2 = MAX( LWRK_ZGEQP3, N/2+LWRK_ZGEQRF,
+     $                          N/2+LWRK_ZGESVD2, N/2+LWRK_ZUNMQR2 )
+                      IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                      OPTWRK2 = N + OPTWRK2
+                      OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                ELSE
+                   CALL ZGESVD( 'S', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                   LWRK_ZGESVD = INT( CDUMMY(1) )
+                   OPTWRK = MAX(LWRK_ZGEQP3,LWRK_ZGESVD,LWRK_ZUNMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                      CALL ZGELQF(N/2,N,U,LDU,CDUMMY,CDUMMY,-1,IERR)
+                      LWRK_ZGELQF = INT( CDUMMY(1) )
+                      CALL ZGESVD( 'S','O', N/2,N/2, V, LDV, S, U, LDU,
+     $                     V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                      LWRK_ZGESVD2 = INT( CDUMMY(1) )
+                      CALL ZUNMLQ( 'R', 'N', N, N, N/2, U, LDU, CDUMMY,
+     $                     V, LDV, CDUMMY,-1,IERR )
+                      LWRK_ZUNMLQ = INT( CDUMMY(1) )
+                      OPTWRK2 = MAX( LWRK_ZGEQP3, N/2+LWRK_ZGELQF,
+     $                          N/2+LWRK_ZGESVD2, N/2+LWRK_ZUNMLQ )
+                      IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                      OPTWRK2 = N + OPTWRK2
+                      OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                END IF
+            END IF
+        END IF
+*
+        MINWRK = MAX( 2, MINWRK )
+        OPTWRK = MAX( 2, OPTWRK )
+        IF ( LCWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = -19
+*
+      END IF
+*
+      IF (INFO .EQ. 0 .AND. LRWORK .LT. RMINWRK .AND. .NOT. LQUERY)
+     $    THEN
+         INFO = -21
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGESVDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+*
+*     Return optimal workspace
+*
+          IWORK(1) = IMINWRK
+          CWORK(1) = OPTWRK
+          CWORK(2) = MINWRK
+          RWORK(1) = RMINWRK
+          RETURN
+      END IF
+*
+*     Quick return if the matrix is void.
+*
+      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) ) THEN
+*     .. all output is void.
+         RETURN
+      END IF
+*
+      BIG = DLAMCH('O')
+      ASCALED = .FALSE.
+      IF ( ROWPRM ) THEN
+*           .. reordering the rows in decreasing sequence in the
+*           ell-infinity norm - this enhances numerical robustness in
+*           the case of differently scaled rows.
+          DO 1904 p = 1, M
+*             RWORK(p) = ABS( A(p,IZAMAX(N,A(p,1),LDA)) )
+*             [[ZLANGE will return NaN if an entry of the p-th row is NaN]]
+              RWORK(p) = ZLANGE( 'M', 1, N, A(p,1), LDA, RDUMMY )
+*             .. check for NaN's and Inf's
+              IF ( ( RWORK(p) .NE. RWORK(p) ) .OR.
+     $             ( (RWORK(p)*ZERO) .NE. ZERO ) ) THEN
+                  INFO = -8
+                  CALL XERBLA( 'ZGESVDQ', -INFO )
+                  RETURN
+              END IF
+ 1904     CONTINUE
+          DO 1952 p = 1, M - 1
+              q = IDAMAX( M-p+1, RWORK(p), 1 ) + p - 1
+              IWORK(N+p) = q
+              IF ( p .NE. q ) THEN
+                 RTMP     = RWORK(p)
+                 RWORK(p) = RWORK(q)
+                 RWORK(q) = RTMP
+              END IF
+ 1952     CONTINUE
+*
+          IF ( RWORK(1) .EQ. ZERO ) THEN
+*            Quick return: A is the M x N zero matrix.
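+*            [Editorial note, not in the original patch: in this quick
+*            return every requested output is still well defined, as
+*            the code below shows. S is zeroed, the requested parts of
+*            U and/or V are set to (sections of) the identity, the
+*            pivot arrays in IWORK are set to the identity permutation,
+*            and RWORK(1:2) are set to -1 as documented above.]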
+ NUMRANK = 0 + CALL DLASET( 'G', N, 1, ZERO, ZERO, S, N ) + IF ( WNTUS ) CALL ZLASET('G', M, N, CZERO, CONE, U, LDU) + IF ( WNTUA ) CALL ZLASET('G', M, M, CZERO, CONE, U, LDU) + IF ( WNTVA ) CALL ZLASET('G', N, N, CZERO, CONE, V, LDV) + IF ( WNTUF ) THEN + CALL ZLASET( 'G', N, 1, CZERO, CZERO, CWORK, N ) + CALL ZLASET( 'G', M, N, CZERO, CONE, U, LDU ) + END IF + DO 5001 p = 1, N + IWORK(p) = p + 5001 CONTINUE + IF ( ROWPRM ) THEN + DO 5002 p = N + 1, N + M - 1 + IWORK(p) = p - N + 5002 CONTINUE + END IF + IF ( CONDA ) RWORK(1) = -1 + RWORK(2) = -1 + RETURN + END IF +* + IF ( RWORK(1) .GT. BIG / SQRT(DBLE(M)) ) THEN +* .. to prevent overflow in the QR factorization, scale the +* matrix by 1/sqrt(M) if too large entry detected + CALL ZLASCL('G',0,0,SQRT(DBLE(M)),ONE, M,N, A,LDA, IERR) + ASCALED = .TRUE. + END IF + CALL ZLASWP( N, A, LDA, 1, M-1, IWORK(N+1), 1 ) + END IF +* +* .. At this stage, preemptive scaling is done only to avoid column +* norms overflows during the QR factorization. The SVD procedure should +* have its own scaling to save the singular values from overflows and +* underflows. That depends on the SVD procedure. +* + IF ( .NOT.ROWPRM ) THEN + RTMP = ZLANGE( 'M', M, N, A, LDA, RWORK ) + IF ( ( RTMP .NE. RTMP ) .OR. + $ ( (RTMP*ZERO) .NE. ZERO ) ) THEN + INFO = -8 + CALL XERBLA( 'ZGESVDQ', -INFO ) + RETURN + END IF + IF ( RTMP .GT. BIG / SQRT(DBLE(M)) ) THEN +* .. to prevent overflow in the QR factorization, scale the +* matrix by 1/sqrt(M) if too large entry detected + CALL ZLASCL('G',0,0, SQRT(DBLE(M)),ONE, M,N, A,LDA, IERR) + ASCALED = .TRUE. + END IF + END IF +* +* .. QR factorization with column pivoting +* +* A * P = Q * [ R ] +* [ 0 ] +* + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL ZGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LCWORK-N, + $ RWORK, IERR ) +* +* If the user requested accuracy level allows truncation in the +* computed upper triangular factor, the matrix R is examined and, +* if possible, replaced with its leading upper trapezoidal part. +* + EPSLN = DLAMCH('E') + SFMIN = DLAMCH('S') +* SMALL = SFMIN / EPSLN + NR = N +* + IF ( ACCLA ) THEN +* +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPS*||A||_F are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPS*||A||_F. + NR = 1 + RTMP = SQRT(DBLE(N))*EPSLN + DO 3001 p = 2, N + IF ( ABS(A(p,p)) .LT. (RTMP*ABS(A(1,1))) ) GO TO 3002 + NR = NR + 1 + 3001 CONTINUE + 3002 CONTINUE +* + ELSEIF ( ACCLM ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R is used as the criterion for being +* close-to-rank-deficient. The threshold is set to EPSLN=DLAMCH('E'). +* [[This can be made more flexible by replacing this hard-coded value +* with a user specified threshold.]] Also, the values that underflow +* will be truncated. + NR = 1 + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SFMIN ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* .. RRQR not authorized to determine numerical rank except in the +* obvious case of zero pivots. +* .. inspect R for exact zeros on the diagonal; +* R(i,i)=0 => R(i:N,i:N)=0. + NR = 1 + DO 3501 p = 2, N + IF ( ABS(A(p,p)) .EQ. ZERO ) GO TO 3502 + NR = NR + 1 + 3501 CONTINUE + 3502 CONTINUE +* + IF ( CONDA ) THEN +* Estimate the scaled condition number of A. 
Use the fact that it is
+*            the same as the scaled condition number of R.
+*            .. V is used as workspace
+             CALL ZLACPY( 'U', N, N, A, LDA, V, LDV )
+*            Only the leading NR x NR submatrix of the triangular factor
+*            is considered. Only if NR=N will this give a reliable error
+*            bound. However, even for NR < N, this can be used at an
+*            expert level to obtain useful information in the sense of
+*            perturbation theory.
+             DO 3053 p = 1, NR
+                RTMP = DZNRM2( p, V(1,p), 1 )
+                CALL ZDSCAL( p, ONE/RTMP, V(1,p), 1 )
+ 3053        CONTINUE
+             IF ( .NOT. ( LSVEC .OR. RSVEC ) ) THEN
+                 CALL ZPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                CWORK, RWORK, IERR )
+             ELSE
+                 CALL ZPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                CWORK(N+1), RWORK, IERR )
+             END IF
+             SCONDA = ONE / SQRT(RTMP)
+*            For NR=N, SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1),
+*            N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*            See the reference [1] for more details.
+          END IF
+*
+       ENDIF
+*
+       IF ( WNTUR ) THEN
+           N1 = NR
+       ELSE IF ( WNTUS .OR. WNTUF) THEN
+           N1 = N
+       ELSE IF ( WNTUA ) THEN
+           N1 = M
+       END IF
+*
+       IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*.......................................................................
+*        .. only the singular values are requested
+*.......................................................................
+          IF ( RTRANS ) THEN
+*
+*           .. compute the singular values of R**H = [A](1:NR,1:N)**H
+*              .. set the lower triangle of [A] to [A](1:NR,1:N)**H and
+*              the upper triangle of [A] to zero.
+            DO 1146 p = 1, MIN( N, NR )
+               A(p,p) = CONJG(A(p,p))
+               DO 1147 q = p + 1, N
+                  A(q,p) = CONJG(A(p,q))
+                  IF ( q .LE. NR ) A(p,q) = CZERO
+ 1147          CONTINUE
+ 1146       CONTINUE
+*
+            CALL ZGESVD( 'N', 'N', N, NR, A, LDA, S, U, LDU,
+     $           V, LDV, CWORK, LCWORK, RWORK, INFO )
+*
+          ELSE
+*
+*           .. compute the singular values of R = [A](1:NR,1:N)
+*
+            IF ( NR .GT. 1 )
+     $          CALL ZLASET( 'L', NR-1,NR-1, CZERO,CZERO, A(2,1), LDA )
+            CALL ZGESVD( 'N', 'N', NR, N, A, LDA, S, U, LDU,
+     $           V, LDV, CWORK, LCWORK, RWORK, INFO )
+*
+          END IF
+*
+       ELSE IF ( LSVEC .AND. ( .NOT. RSVEC) ) THEN
+*.......................................................................
+*       .. the singular values and the left singular vectors requested
+*.......................................................................
+          IF ( RTRANS ) THEN
+*            .. apply ZGESVD to R**H
+*            .. copy R**H into [U] and overwrite [U] with the right singular
+*            vectors of R
+            DO 1192 p = 1, NR
+               DO 1193 q = p, N
+                  U(q,p) = CONJG(A(p,q))
+ 1193          CONTINUE
+ 1192       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, U(1,2), LDU )
+*           .. the left singular vectors not computed, the NR right singular
+*           vectors overwrite [U](1:NR,1:NR) as conjugate transposed. These
+*           will be pre-multiplied by Q to build the left singular vectors of A.
+               CALL ZGESVD( 'N', 'O', N, NR, U, LDU, S, U, LDU,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+               DO 1119 p = 1, NR
+                  U(p,p) = CONJG(U(p,p))
+                  DO 1120 q = p + 1, NR
+                     CTMP   = CONJG(U(q,p))
+                     U(q,p) = CONJG(U(p,q))
+                     U(p,q) = CTMP
+ 1120             CONTINUE
+ 1119          CONTINUE
+*
+          ELSE
+*            .. apply ZGESVD to R
+*            .. copy R into [U] and overwrite [U] with the left singular vectors
+            CALL ZLACPY( 'U', NR, N, A, LDA, U, LDU )
+            IF ( NR .GT. 1 )
+     $        CALL ZLASET( 'L', NR-1, NR-1, CZERO, CZERO, U(2,1), LDU )
+*            .. the right singular vectors not computed, the NR left singular
+*            vectors overwrite [U](1:NR,1:NR)
+               CALL ZGESVD( 'O', 'N', NR, N, U, LDU, S, U, LDU,
+     $              V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*               .. now [U](1:NR,1:NR) contains the NR left singular vectors of
+*               R.
These will be pre-multiplied by Q to build the left singular
+*               vectors of A.
+          END IF
+*
+*           .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+          IF ( ( NR .LT. M ) .AND. ( .NOT.WNTUF ) ) THEN
+             CALL ZLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU)
+             IF ( NR .LT. N1 ) THEN
+                CALL ZLASET( 'A',NR,N1-NR,CZERO,CZERO,U(1,NR+1), LDU )
+                CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $               U(NR+1,NR+1), LDU )
+             END IF
+          END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+          IF ( .NOT.WNTUF )
+     $        CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $             LDU, CWORK(N+1), LCWORK-N, IERR )
+          IF ( ROWPRM .AND. .NOT.WNTUF )
+     $         CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+       ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
+*.......................................................................
+*       .. the singular values and the right singular vectors requested
+*.......................................................................
+           IF ( RTRANS ) THEN
+*            .. apply ZGESVD to R**H
+*            .. copy R**H into V and overwrite V with the left singular vectors
+            DO 1165 p = 1, NR
+               DO 1166 q = p, N
+                  V(q,p) = CONJG(A(p,q))
+ 1166          CONTINUE
+ 1165       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+*           .. the left singular vectors of R**H overwrite V, the right singular
+*           vectors not computed
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+               CALL ZGESVD( 'O', 'N', N, NR, V, LDV, S, U, LDU,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+               DO 1121 p = 1, NR
+                  V(p,p) = CONJG(V(p,p))
+                  DO 1122 q = p + 1, NR
+                     CTMP   = CONJG(V(q,p))
+                     V(q,p) = CONJG(V(p,q))
+                     V(p,q) = CTMP
+ 1122             CONTINUE
+ 1121          CONTINUE
+*
+               IF ( NR .LT. N ) THEN
+                  DO 1103 p = 1, NR
+                     DO 1104 q = NR + 1, N
+                        V(p,q) = CONJG(V(q,p))
+ 1104                CONTINUE
+ 1103             CONTINUE
+               END IF
+               CALL ZLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is a simple implementation that augments [V](1:N,1:NR)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the QR factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+               CALL ZLASET('G', N, N-NR, CZERO, CZERO, V(1,NR+1), LDV)
+               CALL ZGESVD( 'O', 'N', N, N, V, LDV, S, U, LDU,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+               DO 1123 p = 1, N
+                  V(p,p) = CONJG(V(p,p))
+                  DO 1124 q = p + 1, N
+                     CTMP   = CONJG(V(q,p))
+                     V(q,p) = CONJG(V(p,q))
+                     V(p,q) = CTMP
+ 1124             CONTINUE
+ 1123          CONTINUE
+               CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK )
+            END IF
+*
+          ELSE
+*            .. apply ZGESVD to R
+*            .. copy R into V and overwrite V with the right singular vectors
+            CALL ZLACPY( 'U', NR, N, A, LDA, V, LDV )
+            IF ( NR .GT. 1 )
+     $        CALL ZLASET( 'L', NR-1, NR-1, CZERO, CZERO, V(2,1), LDV )
+*            .. the right singular vectors overwrite V, the NR left singular
+*            vectors stored in U(1:NR,1:NR)
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+               CALL ZGESVD( 'N', 'O', NR, N, V, LDV, S, U, LDU,
+     $              V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+               CALL ZLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**H
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is a simple implementation that augments [V](1:NR,1:N)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the LQ factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
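+*               [Editorial sketch, assuming NR < N as tested above: the
+*               ZLASET call below zero-pads [V] so that ZGESVD factors
+*               the N x N matrix
+*                   [ R(1:NR,1:N) ]
+*                   [      0      ]
+*               whose right singular vectors coincide with those of R,
+*               since [R; 0]**H * [R; 0] = R**H * R. This trades the
+*               cheaper LQ-based route mentioned above for simplicity.]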
+ CALL ZLASET('G', N-NR, N, CZERO,CZERO, V(NR+1,1), LDV) + CALL ZGESVD( 'N', 'O', N, N, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO ) + CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK ) + END IF +* .. now [V] contains the adjoint of the matrix of the right singular +* vectors of A. + END IF +* + ELSE +*....................................................................... +* .. FULL SVD requested +*....................................................................... + IF ( RTRANS ) THEN +* +* .. apply ZGESVD to R**H [[this option is left for R&D&T]] +* + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN +* .. copy R**H into [V] and overwrite [V] with the left singular +* vectors of R**H + DO 1168 p = 1, NR + DO 1169 q = p, N + V(q,p) = CONJG(A(p,q)) + 1169 CONTINUE + 1168 CONTINUE + IF ( NR .GT. 1 ) + $ CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV ) +* +* .. the left singular vectors of R**H overwrite [V], the NR right +* singular vectors of R**H stored in [U](1:NR,1:NR) as conjugate +* transposed + CALL ZGESVD( 'O', 'A', N, NR, V, LDV, S, V, LDV, + $ U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO ) +* .. assemble V + DO 1115 p = 1, NR + V(p,p) = CONJG(V(p,p)) + DO 1116 q = p + 1, NR + CTMP = CONJG(V(q,p)) + V(q,p) = CONJG(V(p,q)) + V(p,q) = CTMP + 1116 CONTINUE + 1115 CONTINUE + IF ( NR .LT. N ) THEN + DO 1101 p = 1, NR + DO 1102 q = NR+1, N + V(p,q) = CONJG(V(q,p)) + 1102 CONTINUE + 1101 CONTINUE + END IF + CALL ZLAPMT( .FALSE., NR, N, V, LDV, IWORK ) +* + DO 1117 p = 1, NR + U(p,p) = CONJG(U(p,p)) + DO 1118 q = p + 1, NR + CTMP = CONJG(U(q,p)) + U(q,p) = CONJG(U(p,q)) + U(p,q) = CTMP + 1118 CONTINUE + 1117 CONTINUE +* + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL ZLASET('A', M-NR,NR, CZERO,CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF +* + ELSE +* .. need all N right singular vectors and NR < N +* .. copy R**H into [V] and overwrite [V] with the left singular +* vectors of R**H +* [[The optimal ratio N/NR for using QRF instead of padding +* with zeros. Here hard coded to 2; it must be at least +* two due to work space constraints.]] +* OPTRATIO = ILAENV(6, 'ZGESVD', 'S' // 'O', NR,N,0,0) +* OPTRATIO = MAX( OPTRATIO, 2 ) + OPTRATIO = 2 + IF ( OPTRATIO*NR .GT. N ) THEN + DO 1198 p = 1, NR + DO 1199 q = p, N + V(q,p) = CONJG(A(p,q)) + 1199 CONTINUE + 1198 CONTINUE + IF ( NR .GT. 1 ) + $ CALL ZLASET('U',NR-1,NR-1, CZERO,CZERO, V(1,2),LDV) +* + CALL ZLASET('A',N,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL ZGESVD( 'O', 'A', N, N, V, LDV, S, V, LDV, + $ U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO ) +* + DO 1113 p = 1, N + V(p,p) = CONJG(V(p,p)) + DO 1114 q = p + 1, N + CTMP = CONJG(V(q,p)) + V(q,p) = CONJG(V(p,q)) + V(p,q) = CTMP + 1114 CONTINUE + 1113 CONTINUE + CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x N1), i.e. (M x N) or (M x M). +* + DO 1111 p = 1, N + U(p,p) = CONJG(U(p,p)) + DO 1112 q = p + 1, N + CTMP = CONJG(U(q,p)) + U(q,p) = CONJG(U(p,q)) + U(p,q) = CTMP + 1112 CONTINUE + 1111 CONTINUE +* + IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL ZLASET('A',M-N,N,CZERO,CZERO,U(N+1,1),LDU) + IF ( N .LT. N1 ) THEN + CALL ZLASET('A',N,N1-N,CZERO,CZERO,U(1,N+1),LDU) + CALL ZLASET('A',M-N,N1-N,CZERO,CONE, + $ U(N+1,N+1), LDU ) + END IF + END IF + ELSE +* .. 
copy R**H into [U] and overwrite [U] with the right +* singular vectors of R + DO 1196 p = 1, NR + DO 1197 q = p, N + U(q,NR+p) = CONJG(A(p,q)) + 1197 CONTINUE + 1196 CONTINUE + IF ( NR .GT. 1 ) + $ CALL ZLASET('U',NR-1,NR-1,CZERO,CZERO,U(1,NR+2),LDU) + CALL ZGEQRF( N, NR, U(1,NR+1), LDU, CWORK(N+1), + $ CWORK(N+NR+1), LCWORK-N-NR, IERR ) + DO 1143 p = 1, NR + DO 1144 q = 1, N + V(q,p) = CONJG(U(p,NR+q)) + 1144 CONTINUE + 1143 CONTINUE + CALL ZLASET('U',NR-1,NR-1,CZERO,CZERO,V(1,2),LDV) + CALL ZGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU, + $ V,LDV, CWORK(N+NR+1),LCWORK-N-NR,RWORK, INFO ) + CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV) + CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + CALL ZUNMQR('R','C', N, N, NR, U(1,NR+1), LDU, + $ CWORK(N+1),V,LDV,CWORK(N+NR+1),LCWORK-N-NR,IERR) + CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL ZLASET('A',M-NR,NR,CZERO,CZERO,U(NR+1,1),LDU) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF + END IF + END IF +* + ELSE +* +* .. apply ZGESVD to R [[this is the recommended option]] +* + IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN +* .. copy R into [V] and overwrite V with the right singular vectors + CALL ZLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL ZLASET( 'L', NR-1,NR-1, CZERO,CZERO, V(2,1), LDV ) +* .. the right singular vectors of R overwrite [V], the NR left +* singular vectors of R stored in [U](1:NR,1:NR) + CALL ZGESVD( 'S', 'O', NR, N, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO ) + CALL ZLAPMT( .FALSE., NR, N, V, LDV, IWORK ) +* .. now [V](1:NR,1:N) contains V(1:N,1:NR)**H +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL ZLASET('A', M-NR,NR, CZERO,CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF +* + ELSE +* .. need all N right singular vectors and NR < N +* .. the requested number of the left singular vectors +* is then N1 (N or M) +* [[The optimal ratio N/NR for using LQ instead of padding +* with zeros. Here hard coded to 2; it must be at least +* two due to work space constraints.]] +* OPTRATIO = ILAENV(6, 'ZGESVD', 'S' // 'O', NR,N,0,0) +* OPTRATIO = MAX( OPTRATIO, 2 ) + OPTRATIO = 2 + IF ( OPTRATIO * NR .GT. N ) THEN + CALL ZLACPY( 'U', NR, N, A, LDA, V, LDV ) + IF ( NR .GT. 1 ) + $ CALL ZLASET('L', NR-1,NR-1, CZERO,CZERO, V(2,1),LDV) +* .. the right singular vectors of R overwrite [V], the NR left +* singular vectors of R stored in [U](1:NR,1:NR) + CALL ZLASET('A', N-NR,N, CZERO,CZERO, V(NR+1,1),LDV) + CALL ZGESVD( 'S', 'O', N, N, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO ) + CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. now [V] contains the adjoint of the matrix of the right +* singular vectors of A. The leading N left singular vectors +* are in [U](1:N,1:N) +* .. assemble the left singular vector matrix U of dimensions +* (M x N1), i.e. (M x N) or (M x M). + IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL ZLASET('A',M-N,N,CZERO,CZERO,U(N+1,1),LDU) + IF ( N .LT. 
N1 ) THEN + CALL ZLASET('A',N,N1-N,CZERO,CZERO,U(1,N+1),LDU) + CALL ZLASET( 'A',M-N,N1-N,CZERO,CONE, + $ U(N+1,N+1), LDU ) + END IF + END IF + ELSE + CALL ZLACPY( 'U', NR, N, A, LDA, U(NR+1,1), LDU ) + IF ( NR .GT. 1 ) + $ CALL ZLASET('L',NR-1,NR-1,CZERO,CZERO,U(NR+2,1),LDU) + CALL ZGELQF( NR, N, U(NR+1,1), LDU, CWORK(N+1), + $ CWORK(N+NR+1), LCWORK-N-NR, IERR ) + CALL ZLACPY('L',NR,NR,U(NR+1,1),LDU,V,LDV) + IF ( NR .GT. 1 ) + $ CALL ZLASET('U',NR-1,NR-1,CZERO,CZERO,V(1,2),LDV) + CALL ZGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU, + $ V, LDV, CWORK(N+NR+1), LCWORK-N-NR, RWORK, INFO ) + CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV) + CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + CALL ZUNMLQ('R','N',N,N,NR,U(NR+1,1),LDU,CWORK(N+1), + $ V, LDV, CWORK(N+NR+1),LCWORK-N-NR,IERR) + CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK ) +* .. assemble the left singular vector matrix U of dimensions +* (M x NR) or (M x N) or (M x M). + IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN + CALL ZLASET('A',M-NR,NR,CZERO,CZERO,U(NR+1,1),LDU) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1), LDU ) + END IF + END IF + END IF + END IF +* .. end of the "R**H or R" branch + END IF +* +* The Q matrix from the first QRF is built into the left singular +* vectors matrix U. +* + IF ( .NOT. WNTUF ) + $ CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LCWORK-N, IERR ) + IF ( ROWPRM .AND. .NOT.WNTUF ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 ) +* +* ... end of the "full SVD" branch + END IF +* +* Check whether some singular values are returned as zeros, e.g. +* due to underflow, and update the numerical rank. + p = NR + DO 4001 q = p, 1, -1 + IF ( S(q) .GT. ZERO ) GO TO 4002 + NR = NR - 1 + 4001 CONTINUE + 4002 CONTINUE +* +* .. if numerical rank deficiency is detected, the truncated +* singular values are set to zero. + IF ( NR .LT. N ) CALL DLASET( 'G', N-NR,1, ZERO,ZERO, S(NR+1), N ) +* .. undo scaling; this may cause overflow in the largest singular +* values. + IF ( ASCALED ) + $ CALL DLASCL( 'G',0,0, ONE,SQRT(DBLE(M)), NR,1, S, N, IERR ) + IF ( CONDA ) RWORK(1) = SCONDA + RWORK(2) = p - NR +* .. p-NR is the number of singular values that are computed as +* exact zeros in ZGESVD() applied to the (possibly truncated) +* full row rank triangular (trapezoidal) factor of A. + NUMRANK = NR +* + RETURN +* +* End of ZGESVDQ +* + END diff --git a/lapack-netlib/SRC/zgesvdx.f b/lapack-netlib/SRC/zgesvdx.f index 56b5cd4f2..12b20c0ba 100644 --- a/lapack-netlib/SRC/zgesvdx.f +++ b/lapack-netlib/SRC/zgesvdx.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU, +* SUBROUTINE ZGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU, * $ IL, IU, NS, S, U, LDU, VT, LDVT, WORK, * $ LWORK, RWORK, IWORK, INFO ) * diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f index fd32f92d8..7c25a3495 100644 --- a/lapack-netlib/SRC/zgesvj.f +++ b/lapack-netlib/SRC/zgesvj.f @@ -89,12 +89,12 @@ *> Specifies whether to compute the right singular vectors, that *> is, the matrix V: *> = 'V' or 'J': the matrix V is computed and returned in the array V -*> = 'A' : the Jacobi rotations are applied to the MV-by-N +*> = 'A': the Jacobi rotations are applied to the MV-by-N *> array V. 
In other words, the right singular vector *> matrix V is not computed explicitly; instead it is *> applied to an MV-by-N matrix initially stored in the *> first MV rows of V. -*> = 'N' : the matrix V is not computed and the array V is not +*> = 'N': the matrix V is not computed and the array V is not *> referenced *> \endverbatim *> @@ -116,8 +116,8 @@ *> A is COMPLEX*16 array, dimension (LDA,N) *> On entry, the M-by-N matrix A. *> On exit, -*> If JOBU .EQ. 'U' .OR. JOBU .EQ. 'C': -*> If INFO .EQ. 0 : +*> If JOBU = 'U' .OR. JOBU = 'C': +*> If INFO = 0 : *> RANKA orthonormal columns of U are returned in the *> leading RANKA columns of the array A. Here RANKA <= N *> is the number of computed singular values of A that are @@ -127,9 +127,9 @@ *> in the array RWORK as RANKA=NINT(RWORK(2)). Also see the *> descriptions of SVA and RWORK. The computed columns of U *> are mutually numerically orthogonal up to approximately -*> TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU.EQ.'C'), +*> TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU = 'C'), *> see the description of JOBU. -*> If INFO .GT. 0, +*> If INFO > 0, *> the procedure ZGESVJ did not converge in the given number *> of iterations (sweeps). In that case, the computed *> columns of U may not be orthogonal up to TOL. The output @@ -137,8 +137,8 @@ *> values in SVA(1:N)) and V is still a decomposition of the *> input matrix A in the sense that the residual *> || A - SCALE * U * SIGMA * V^* ||_2 / ||A||_2 is small. -*> If JOBU .EQ. 'N': -*> If INFO .EQ. 0 : +*> If JOBU = 'N': +*> If INFO = 0 : *> Note that the left singular vectors are 'for free' in the *> one-sided Jacobi SVD algorithm. However, if only the *> singular values are needed, the level of numerical @@ -147,7 +147,7 @@ *> numerically orthogonal up to approximately M*EPS. Thus, *> on exit, A contains the columns of U scaled with the *> corresponding singular values. -*> If INFO .GT. 0 : +*> If INFO > 0: *> the procedure ZGESVJ did not converge in the given number *> of iterations (sweeps). *> \endverbatim @@ -162,9 +162,9 @@ *> \verbatim *> SVA is DOUBLE PRECISION array, dimension (N) *> On exit, -*> If INFO .EQ. 0 : +*> If INFO = 0 : *> depending on the value SCALE = RWORK(1), we have: -*> If SCALE .EQ. ONE: +*> If SCALE = ONE: *> SVA(1:N) contains the computed singular values of A. *> During the computation SVA contains the Euclidean column *> norms of the iterated matrices in the array A. @@ -173,7 +173,7 @@ *> factored representation is due to the fact that some of the *> singular values of A might underflow or overflow. *> -*> If INFO .GT. 0 : +*> If INFO > 0: *> the procedure ZGESVJ did not converge in the given number of *> iterations (sweeps) and SCALE*SVA(1:N) may not be accurate. *> \endverbatim @@ -181,7 +181,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then the product of Jacobi rotations in ZGESVJ +*> If JOBV = 'A', then the product of Jacobi rotations in ZGESVJ *> is applied to the first MV rows of V. See the description of JOBV. *> \endverbatim *> @@ -199,16 +199,16 @@ *> \param[in] LDV *> \verbatim *> LDV is INTEGER -*> The leading dimension of the array V, LDV .GE. 1. -*> If JOBV .EQ. 'V', then LDV .GE. max(1,N). -*> If JOBV .EQ. 'A', then LDV .GE. max(1,MV) . +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V', then LDV >= max(1,N). +*> If JOBV = 'A', then LDV >= max(1,MV) . *> \endverbatim *> *> \param[in,out] CWORK *> \verbatim *> CWORK is COMPLEX*16 array, dimension (max(1,LWORK)) *> Used as workspace. 
-*> If on entry LWORK .EQ. -1, then a workspace query is assumed and +*> If on entry LWORK = -1, then a workspace query is assumed and *> no computation is done; CWORK(1) is set to the minial (and optimal) *> length of CWORK. *> \endverbatim @@ -223,7 +223,7 @@ *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (max(6,LRWORK)) *> On entry, -*> If JOBU .EQ. 'C' : +*> If JOBU = 'C' : *> RWORK(1) = CTOL, where CTOL defines the threshold for convergence. *> The process stops if all columns of A are mutually *> orthogonal up to CTOL*EPS, EPS=DLAMCH('E'). @@ -243,11 +243,11 @@ *> RWORK(5) = max_{i.NE.j} |COS(A(:,i),A(:,j))| in the last sweep. *> This is useful information in cases when ZGESVJ did *> not converge, as it can be used to estimate whether -*> the output is stil useful and for post festum analysis. +*> the output is still useful and for post festum analysis. *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. -*> If on entry LRWORK .EQ. -1, then a workspace query is assumed and +*> If on entry LRWORK = -1, then a workspace query is assumed and *> no computation is done; RWORK(1) is set to the minial (and optimal) *> length of RWORK. *> \endverbatim @@ -261,9 +261,9 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value -*> > 0 : ZGESVJ did not converge in the maximal allowed number +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value +*> > 0: ZGESVJ did not converge in the maximal allowed number *> (NSWEEP=30) of sweeps. The output may still be useful. *> See the description of RWORK. *> \endverbatim diff --git a/lapack-netlib/SRC/zgesvxx.f b/lapack-netlib/SRC/zgesvxx.f index c3727b70e..60bb71cd3 100644 --- a/lapack-netlib/SRC/zgesvxx.f +++ b/lapack-netlib/SRC/zgesvxx.f @@ -411,7 +411,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -447,14 +447,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -462,9 +462,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. +*> = 1.0: Use the extra-precise refinement algorithm. 
*> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 5ce11efef..1aab3c662 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -1,3 +1,5 @@ +*> \brief \b ZGETSLS +* * Definition: * =========== * diff --git a/lapack-netlib/SRC/zggesx.f b/lapack-netlib/SRC/zggesx.f index 661523465..c546e61f1 100644 --- a/lapack-netlib/SRC/zggesx.f +++ b/lapack-netlib/SRC/zggesx.f @@ -120,10 +120,10 @@ *> \verbatim *> SENSE is CHARACTER*1 *> Determines which reciprocal condition numbers are computed. -*> = 'N' : None are computed; -*> = 'E' : Computed for average of selected eigenvalues only; -*> = 'V' : Computed for selected deflating subspaces only; -*> = 'B' : Computed for both. +*> = 'N': None are computed; +*> = 'E': Computed for average of selected eigenvalues only; +*> = 'V': Computed for selected deflating subspaces only; +*> = 'B': Computed for both. *> If SENSE = 'E', 'V', or 'B', SORT must equal 'S'. *> \endverbatim *> diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f index c4a6bd38a..ab7e31725 100644 --- a/lapack-netlib/SRC/zgsvj0.f +++ b/lapack-netlib/SRC/zgsvj0.f @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX*16 array, dimension (LDV,N) -*> If JOBV .EQ. 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multipled by a *> sequence of Jacobi rotations. -*> If JOBV .EQ. 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim @@ -136,8 +136,8 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', LDV .GE. N. -*> If JOBV = 'A', LDV .GE. MV. +*> If JOBV = 'V', LDV >= N. +*> If JOBV = 'A', LDV >= MV. *> \endverbatim *> *> \param[in] EPS @@ -157,7 +157,7 @@ *> TOL is DOUBLE PRECISION *> TOL is the threshold for Jacobi rotations. For a pair *> A(:,p), A(:,q) of pivot columns, the Jacobi rotation is -*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL. +*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL. *> \endverbatim *> *> \param[in] NSWEEP @@ -175,14 +175,14 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> LWORK is the dimension of WORK. LWORK .GE. M. +*> LWORK is the dimension of WORK. LWORK >= M. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f index 91e39ca8a..f0a23034b 100644 --- a/lapack-netlib/SRC/zgsvj1.f +++ b/lapack-netlib/SRC/zgsvj1.f @@ -61,7 +61,7 @@ *> In terms of the columns of A, the first N1 columns are rotated 'against' *> the remaining N-N1 columns, trying to increase the angle between the *> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is -*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parmeter. 
+*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parameter. *> The number of sweeps is given in NSWEEP and the orthogonality threshold *> is given in TOL. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV .EQ. 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX*16 array, dimension (LDV,N) -*> If JOBV .EQ. 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multipled by a *> sequence of Jacobi rotations. -*> If JOBV .EQ. 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multipled by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim @@ -166,8 +166,8 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', LDV .GE. N. -*> If JOBV = 'A', LDV .GE. MV. +*> If JOBV = 'V', LDV >= N. +*> If JOBV = 'A', LDV >= MV. *> \endverbatim *> *> \param[in] EPS @@ -187,7 +187,7 @@ *> TOL is DOUBLE PRECISION *> TOL is the threshold for Jacobi rotations. For a pair *> A(:,p), A(:,q) of pivot columns, the Jacobi rotation is -*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL. +*> applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL. *> \endverbatim *> *> \param[in] NSWEEP @@ -205,14 +205,14 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> LWORK is the dimension of WORK. LWORK .GE. M. +*> LWORK is the dimension of WORK. LWORK >= M. *> \endverbatim *> *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0 : successful exit. -*> < 0 : if INFO = -i, then the i-th argument had an illegal value +*> = 0: successful exit. +*> < 0: if INFO = -i, then the i-th argument had an illegal value *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zhb2st_kernels.f b/lapack-netlib/SRC/zhb2st_kernels.f index a440b5c0d..2c0cb6870 100644 --- a/lapack-netlib/SRC/zhb2st_kernels.f +++ b/lapack-netlib/SRC/zhb2st_kernels.f @@ -1,26 +1,26 @@ *> \brief \b ZHB2ST_KERNELS * * @precisions fortran z -> s d c -* +* * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download ZHB2ST_KERNELS + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download ZHB2ST_KERNELS + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, +* SUBROUTINE ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, * ST, ED, SWEEP, N, NB, IB, * A, LDA, V, TAU, LDVT, WORK) * @@ -32,9 +32,9 @@ * INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT * .. * .. Array Arguments .. -* COMPLEX*16 A( LDA, * ), V( * ), +* COMPLEX*16 A( LDA, * ), V( * ), * TAU( * ), WORK( * ) -* +* *> \par Purpose: * ============= *> @@ -124,7 +124,7 @@ *> LDVT is INTEGER. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array. Workspace of size nb. *> \endverbatim @@ -147,7 +147,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. 
-*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -155,16 +155,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, + SUBROUTINE ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, $ ST, ED, SWEEP, N, NB, IB, $ A, LDA, V, TAU, LDVT, WORK) * @@ -181,7 +181,7 @@ INTEGER TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), V( * ), + COMPLEX*16 A( LDA, * ), V( * ), $ TAU( * ), WORK( * ) * .. * @@ -195,8 +195,8 @@ * .. Local Scalars .. LOGICAL UPPER INTEGER I, J1, J2, LM, LN, VPOS, TAUPOS, - $ DPOS, OFDPOS, AJETER - COMPLEX*16 CTMP + $ DPOS, OFDPOS, AJETER + COMPLEX*16 CTMP * .. * .. External Subroutines .. EXTERNAL ZLARFG, ZLARFX, ZLARFY @@ -209,7 +209,7 @@ * .. * .. * .. Executable Statements .. -* +* AJETER = IB + LDVT UPPER = LSAME( UPLO, 'U' ) @@ -240,10 +240,10 @@ V( VPOS ) = ONE DO 10 I = 1, LM-1 V( VPOS+I ) = DCONJG( A( OFDPOS-I, ST+I ) ) - A( OFDPOS-I, ST+I ) = ZERO + A( OFDPOS-I, ST+I ) = ZERO 10 CONTINUE CTMP = DCONJG( A( OFDPOS, ST ) ) - CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, + CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) A( OFDPOS, ST ) = CTMP * @@ -281,14 +281,14 @@ * V( VPOS ) = ONE DO 30 I = 1, LM-1 - V( VPOS+I ) = + V( VPOS+I ) = $ DCONJG( A( DPOS-NB-I, J1+I ) ) A( DPOS-NB-I, J1+I ) = ZERO 30 CONTINUE CTMP = DCONJG( A( DPOS-NB, J1 ) ) CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) ) A( DPOS-NB, J1 ) = CTMP -* +* CALL ZLARFX( 'Right', LN-1, LM, V( VPOS ), $ TAU( TAUPOS ), $ A( DPOS-NB+1, J1 ), LDA-1, WORK) @@ -296,9 +296,9 @@ ENDIF * * Lower case -* +* ELSE -* +* IF( WANTZ ) THEN VPOS = MOD( SWEEP-1, 2 ) * N + ST TAUPOS = MOD( SWEEP-1, 2 ) * N + ST @@ -313,9 +313,9 @@ V( VPOS ) = ONE DO 20 I = 1, LM-1 V( VPOS+I ) = A( OFDPOS+I, ST-1 ) - A( OFDPOS+I, ST-1 ) = ZERO + A( OFDPOS+I, ST-1 ) = ZERO 20 CONTINUE - CALL ZLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, + CALL ZLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) * LM = ED - ST + 1 @@ -342,7 +342,7 @@ LM = J2-J1+1 * IF( LM.GT.0) THEN - CALL ZLARFX( 'Right', LM, LN, V( VPOS ), + CALL ZLARFX( 'Right', LM, LN, V( VPOS ), $ TAU( TAUPOS ), A( DPOS+NB, ST ), $ LDA-1, WORK) * @@ -359,13 +359,13 @@ V( VPOS+I ) = A( DPOS+NB+I, ST ) A( DPOS+NB+I, ST ) = ZERO 40 CONTINUE - CALL ZLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, + CALL ZLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, $ TAU( TAUPOS ) ) * - CALL ZLARFX( 'Left', LM, LN-1, V( VPOS ), + CALL ZLARFX( 'Left', LM, LN-1, V( VPOS ), $ DCONJG( TAU( TAUPOS ) ), $ A( DPOS+NB-1, ST+1 ), LDA-1, WORK) - + ENDIF ENDIF ENDIF @@ -374,4 +374,4 @@ * * END OF ZHB2ST_KERNELS * - END + END diff --git a/lapack-netlib/SRC/zhecon_3.f 
b/lapack-netlib/SRC/zhecon_3.f index 8c3a9f32b..9d2a240b6 100644 --- a/lapack-netlib/SRC/zhecon_3.f +++ b/lapack-netlib/SRC/zhecon_3.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE ZHECON_3( UPLO, N, A, LDA, E, IPIV, ANORM, RCOND, -* WORK, IWORK, INFO ) +* WORK, INFO ) * * .. Scalar Arguments .. * CHARACTER UPLO @@ -27,7 +27,7 @@ * DOUBLE PRECISION ANORM, RCOND * .. * .. Array Arguments .. -* INTEGER IPIV( * ), IWORK( * ) +* INTEGER IPIV( * ) * COMPLEX*16 A( LDA, * ), E ( * ), WORK( * ) * .. * @@ -129,11 +129,6 @@ *> WORK is COMPLEX*16 array, dimension (2*N) *> \endverbatim *> -*> \param[out] IWORK -*> \verbatim -*> IWORK is INTEGER array, dimension (N) -*> \endverbatim -*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f index 810373c83..def2d1f9d 100644 --- a/lapack-netlib/SRC/zheevr.f +++ b/lapack-netlib/SRC/zheevr.f @@ -210,7 +210,7 @@ *> eigenvalues are computed to high relative accuracy when *> possible in future releases. The current code does not *> make any guarantees about high relative accuracy, but -*> furutre releases will. See J. Barlow and J. Demmel, +*> future releases will. See J. Barlow and J. Demmel, *> "Computing Accurate Eigensystems of Scaled Diagonally *> Dominant Matrices", LAPACK Working Note #7, for a discussion *> of which matrices define their eigenvalues to high relative diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f index ab7f3374e..fe4a72160 100644 --- a/lapack-netlib/SRC/zheevr_2stage.f +++ b/lapack-netlib/SRC/zheevr_2stage.f @@ -217,7 +217,7 @@ *> eigenvalues are computed to high relative accuracy when *> possible in future releases. The current code does not *> make any guarantees about high relative accuracy, but -*> furutre releases will. See J. Barlow and J. Demmel, +*> future releases will. See J. Barlow and J. Demmel, *> "Computing Accurate Eigensystems of Scaled Diagonally *> Dominant Matrices", LAPACK Working Note #7, for a discussion *> of which matrices define their eigenvalues to high relative diff --git a/lapack-netlib/SRC/zhegs2.f b/lapack-netlib/SRC/zhegs2.f index 0bdc653b9..aec526353 100644 --- a/lapack-netlib/SRC/zhegs2.f +++ b/lapack-netlib/SRC/zhegs2.f @@ -97,6 +97,7 @@ *> B is COMPLEX*16 array, dimension (LDB,N) *> The triangular factor from the Cholesky factorization of B, *> as returned by ZPOTRF. +*> B is modified by the routine but restored on exit. *> \endverbatim *> *> \param[in] LDB diff --git a/lapack-netlib/SRC/zhegst.f b/lapack-netlib/SRC/zhegst.f index d0c08a8f6..dcf5fe8b5 100644 --- a/lapack-netlib/SRC/zhegst.f +++ b/lapack-netlib/SRC/zhegst.f @@ -97,6 +97,7 @@ *> B is COMPLEX*16 array, dimension (LDB,N) *> The triangular factor from the Cholesky factorization of B, *> as returned by ZPOTRF. +*> B is modified by the routine but restored on exit. *> \endverbatim *> *> \param[in] LDB diff --git a/lapack-netlib/SRC/zherfsx.f b/lapack-netlib/SRC/zherfsx.f index d176b102c..fa11702a8 100644 --- a/lapack-netlib/SRC/zherfsx.f +++ b/lapack-netlib/SRC/zherfsx.f @@ -102,7 +102,7 @@ *> \param[in] A *> \verbatim *> A is COMPLEX*16 array, dimension (LDA,N) -*> The symmetric matrix A. If UPLO = 'U', the leading N-by-N +*> The Hermitian matrix A. If UPLO = 'U', the leading N-by-N *> upper triangular part of A contains the upper triangular *> part of the matrix A, and the strictly lower triangular *> part of A is not referenced. If UPLO = 'L', the leading @@ -270,7 +270,7 @@ *> information as described below. 
There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -306,14 +306,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -321,9 +321,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index 8511f0e7d..5f1a9f4b3 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -42,7 +42,7 @@ *> matrices. *> *> Aasen's algorithm is used to factor A as -*> A = U * T * U**H, if UPLO = 'U', or +*> A = U**H * T * U, if UPLO = 'U', or *> A = L * T * L**H, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is Hermitian and tridiagonal. The factored form @@ -86,7 +86,7 @@ *> *> On exit, if INFO = 0, the tridiagonal matrix T and the *> multipliers used to obtain the factor U or L from the -*> factorization A = U*T*U**H or A = L*T*L**H as computed by +*> factorization A = U**H*T*U or A = L*T*L**H as computed by *> ZHETRF_AA. *> \endverbatim *> @@ -230,7 +230,7 @@ RETURN END IF * -* Compute the factorization A = U*T*U**H or A = L*T*L**H. +* Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) IF( INFO.EQ.0 ) THEN diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index ed221dc69..7a4e35f45 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -44,7 +44,7 @@ *> matrices. *> *> Aasen's 2-stage algorithm is used to factor A as -*> A = U * T * U**H, if UPLO = 'U', or +*> A = U**H * T * U, if UPLO = 'U', or *> A = L * T * L**H, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is Hermitian and band. The matrix T is @@ -211,9 +211,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER I, J, K, I1, I2, TD - INTEGER LDTB, LWKOPT, NB, KB, NT, IINFO - COMPLEX PIV + INTEGER LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -263,7 +261,7 @@ RETURN END IF * -* Compute the factorization A = U*T*U**H or A = L*T*L**H. 
+* Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, $ WORK, LWORK, INFO ) diff --git a/lapack-netlib/SRC/zhesvxx.f b/lapack-netlib/SRC/zhesvxx.f index 375fc072d..20168185c 100644 --- a/lapack-netlib/SRC/zhesvxx.f +++ b/lapack-netlib/SRC/zhesvxx.f @@ -46,7 +46,7 @@ *> *> ZHESVXX uses the diagonal pivoting factorization to compute the *> solution to a complex*16 system of linear equations A * X = B, where -*> A is an N-by-N symmetric matrix and X and B are N-by-NRHS +*> A is an N-by-N Hermitian matrix and X and B are N-by-NRHS *> matrices. *> *> If requested, both normwise and maximum componentwise error bounds @@ -88,7 +88,7 @@ *> A = L * D * L**T, if UPLO = 'L', *> *> where U (or L) is a product of permutation and unit upper (lower) -*> triangular matrices, and D is symmetric and block diagonal with +*> triangular matrices, and D is Hermitian and block diagonal with *> 1-by-1 and 2-by-2 diagonal blocks. *> *> 3. If some D(i,i)=0, so that D is exactly singular, then the @@ -161,7 +161,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX*16 array, dimension (LDA,N) -*> The symmetric matrix A. If UPLO = 'U', the leading N-by-N +*> The Hermitian matrix A. If UPLO = 'U', the leading N-by-N *> upper triangular part of A contains the upper triangular *> part of the matrix A, and the strictly lower triangular *> part of A is not referenced. If UPLO = 'L', the leading @@ -378,7 +378,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -414,14 +414,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -429,9 +429,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. +*> = 1.0: Use the extra-precise refinement algorithm. 
*> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/zhetf2_rk.f b/lapack-netlib/SRC/zhetf2_rk.f index 84d3a0248..6578214df 100644 --- a/lapack-netlib/SRC/zhetf2_rk.f +++ b/lapack-netlib/SRC/zhetf2_rk.f @@ -322,7 +322,7 @@ * * Factorize A as U*D*U**H using the upper triangle of A * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = CZERO @@ -676,7 +676,7 @@ * * Factorize A as L*D*L**H using the lower triangle of A * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = CZERO * diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f index 9d6a426a3..1a2c00a2f 100644 --- a/lapack-netlib/SRC/zhetrd_2stage.f +++ b/lapack-netlib/SRC/zhetrd_2stage.f @@ -123,23 +123,22 @@ *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX*16 array, dimension LHOUS2, that -*> store the Householder representation of the stage2 +*> HOUS2 is COMPLEX*16 array, dimension (LHOUS2) +*> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim *> *> \param[in] LHOUS2 *> \verbatim *> LHOUS2 is INTEGER -*> The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array HOUS2. +*> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns *> this value as the first entry of the HOUS2 array, and no error *> message related to LHOUS2 is issued by XERBLA. -*> LHOUS2 = MAX(1, dimension) where -*> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> If VECT='N', LHOUS2 = max(1, 4*n); +*> if VECT='V', option not yet available. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 86122cccc..4ba7bfc21 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -50,9 +50,9 @@ * Arguments: * ========== * -*> \param[in] STAGE +*> \param[in] STAGE1 *> \verbatim -*> STAGE is CHARACTER*1 +*> STAGE1 is CHARACTER*1 *> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the zhetrd_he2hb routine *> was not called before this routine to reproduce AB. diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e33bf4b2b..b85b3889a 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -363,7 +363,7 @@ * * * Set the workspace of the triangular matrix T to zero once such a -* way everytime T is generated the upper/lower portion will be always zero +* way every time T is generated the upper/lower portion will be always zero * CALL ZLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT ) * diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f index e355aed14..b80a84118 100644 --- a/lapack-netlib/SRC/zhetrf_aa.f +++ b/lapack-netlib/SRC/zhetrf_aa.f @@ -37,7 +37,7 @@ *> ZHETRF_AA computes the factorization of a complex hermitian matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**H or A = L*T*L**H +*> A = U**H*T*U or A = L*T*L**H *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a hermitian tridiagonal matrix. 
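The corrected form A = U**H*T*U can be seen end to end from the driver level. The following minimal free-form Fortran program is a sketch only: it assumes a linked LAPACK (3.8 or later) providing ZHESV_AA with the interface documented in the zhesv_aa.f hunk above, and the 3-by-3 Hermitian matrix and right-hand side are illustrative values, not taken from any test in this patch series.

    program demo_zhesv_aa
      implicit none
      integer, parameter :: n = 3, nrhs = 1
      complex(8) :: a(n,n), b(n,nrhs), wq(1)
      complex(8), allocatable :: work(:)
      integer :: ipiv(n), info, lwork

      ! Illustrative Hermitian test matrix (column-major order)
      a = reshape([ (4.d0, 0.d0), (1.d0,-1.d0), (0.d0,-2.d0),   &
                    (1.d0, 1.d0), (3.d0, 0.d0), (1.d0,-1.d0),   &
                    (0.d0, 2.d0), (1.d0, 1.d0), (5.d0, 0.d0) ], [n, n])
      b(:,1) = [ (1.d0, 0.d0), (2.d0, 0.d0), (3.d0, 0.d0) ]

      ! Workspace query, then factor A = U**H*T*U and solve A*X = B
      call zhesv_aa('U', n, nrhs, a, n, ipiv, b, n, wq, -1, info)
      lwork = int(wq(1))
      allocate(work(lwork))
      call zhesv_aa('U', n, nrhs, a, n, ipiv, b, n, work, lwork, info)
      print *, 'info =', info, '  x =', b(:,1)
    end program demo_zhesv_aa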
@@ -223,7 +223,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**H using the upper triangle of A +* Factorize A as U**H*D*U using the upper triangle of A * ..................................................... * * copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N)) @@ -256,7 +256,7 @@ $ A( MAX(1, J), J+1 ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J @@ -376,7 +376,7 @@ $ A( J+1, MAX(1, J) ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 73c0ebe9a..f63713664 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -38,7 +38,7 @@ *> ZHETRF_AA_2STAGE computes the factorization of a double hermitian matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**H*T*U or A = L*T*L**H *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a hermitian band matrix with the @@ -66,7 +66,7 @@ *> *> \param[in,out] A *> \verbatim -*> A is COMPLEX array, dimension (LDA,N) +*> A is COMPLEX*16 array, dimension (LDA,N) *> On entry, the hermitian matrix A. If UPLO = 'U', the leading *> N-by-N upper triangular part of A contains the upper *> triangular part of the matrix A, and the strictly lower @@ -87,7 +87,7 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (LTB) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> @@ -121,7 +121,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size LWORK *> \endverbatim *> *> \param[in] LWORK @@ -276,7 +276,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**H*D*U using the upper triangle of A * ..................................................... * DO J = 0, NT-1 @@ -452,14 +452,17 @@ c END IF * > Apply pivots to previous columns of L CALL ZSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, $ A( (J+1)*NB+1, I2 ), 1 ) -* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL ZSWAP( I2-I1-1, A( I1, I1+1 ), LDA, - $ A( I1+1, I2 ), 1 ) +* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) + IF( I2.GT.(I1+1) ) THEN + CALL ZSWAP( I2-I1-1, A( I1, I1+1 ), LDA, + $ A( I1+1, I2 ), 1 ) + CALL ZLACGV( I2-I1-1, A( I1+1, I2 ), 1 ) + END IF CALL ZLACGV( I2-I1, A( I1, I1+1 ), LDA ) - CALL ZLACGV( I2-I1-1, A( I1+1, I2 ), 1 ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL ZSWAP( N-I2, A( I1, I2+1 ), LDA, - $ A( I2, I2+1 ), LDA ) + IF( I2.LT.N ) + $ CALL ZSWAP( N-I2, A( I1, I2+1 ), LDA, + $ A( I2, I2+1 ), LDA ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) @@ -476,7 +479,7 @@ c END IF ELSE * * ..................................................... -* Factorize A as L*D*L**T using the lower triangle of A +* Factorize A as L*D*L**H using the lower triangle of A * ..................................................... 
* DO J = 0, NT-1 @@ -629,14 +632,17 @@ c END IF * > Apply pivots to previous columns of L CALL ZSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, $ A( I2, (J+1)*NB+1 ), LDA ) -* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL ZSWAP( I2-I1-1, A( I1+1, I1 ), 1, - $ A( I2, I1+1 ), LDA ) +* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) + IF( I2.GT.(I1+1) ) THEN + CALL ZSWAP( I2-I1-1, A( I1+1, I1 ), 1, + $ A( I2, I1+1 ), LDA ) + CALL ZLACGV( I2-I1-1, A( I2, I1+1 ), LDA ) + END IF CALL ZLACGV( I2-I1, A( I1+1, I1 ), 1 ) - CALL ZLACGV( I2-I1-1, A( I2, I1+1 ), LDA ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL ZSWAP( N-I2, A( I2+1, I1 ), 1, - $ A( I2+1, I2 ), 1 ) + IF( I2.LT.N ) + $ CALL ZSWAP( N-I2, A( I2+1, I1 ), 1, + $ A( I2+1, I2 ), 1 ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f index a7acff49f..ae43b14fe 100644 --- a/lapack-netlib/SRC/zhetri2.f +++ b/lapack-netlib/SRC/zhetri2.f @@ -62,7 +62,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX*16 array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers +*> On entry, the block diagonal matrix D and the multipliers *> used to obtain the factor U or L as computed by ZHETRF. *> *> On exit, if INFO = 0, the (symmetric) inverse of the original @@ -82,7 +82,7 @@ *> \param[in] IPIV *> \verbatim *> IPIV is INTEGER array, dimension (N) -*> Details of the interchanges and the NB structure of D +*> Details of the interchanges and the block structure of D *> as determined by ZHETRF. *> \endverbatim *> diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f index 9d302b9cd..4b3253abc 100644 --- a/lapack-netlib/SRC/zhetrs_aa.f +++ b/lapack-netlib/SRC/zhetrs_aa.f @@ -38,8 +38,8 @@ *> \verbatim *> *> ZHETRS_AA solves a system of linear equations A*X = B with a complex -*> hermitian matrix A using the factorization A = U*T*U**H or -*> A = L*T*L**T computed by ZHETRF_AA. +*> hermitian matrix A using the factorization A = U**H*T*U or +*> A = L*T*L**H computed by ZHETRF_AA. *> \endverbatim * * Arguments: @@ -50,7 +50,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**H; +*> = 'U': Upper triangular, form is A = U**H*T*U; *> = 'L': Lower triangular, form is A = L*T*L**H. *> \endverbatim *> @@ -98,14 +98,16 @@ *> The leading dimension of the array B. LDB >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim -*> WORK is DOUBLE array, dimension (MAX(1,LWORK)) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER, LWORK >= MAX(1,3*N-2). +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -199,61 +201,80 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**H*T*U. 
* -* Pivot, P**T * B +* 1) Forward substitution with U**H * - DO K = 1, N - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO + IF( N.GT.1 ) THEN * -* Compute (U \P**T * B) -> B [ (U \P**T * B) ] +* Pivot, P**T * B -> B * - CALL ZTRSM('L', 'U', 'C', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) + DO K = 1, N + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO * -* Compute T \ B -> B [ T \ (U \P**T * B) ] +* Compute U**H \ B -> B [ (U**H \P**T * B) ] * - CALL ZLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1) + CALL ZTRSM( 'L', 'U', 'C', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB ) + END IF +* +* 2) Solve with triangular matrix T +* +* Compute T \ B -> B [ T \ (U**H \P**T * B) ] +* + CALL ZLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1 ) IF( N.GT.1 ) THEN CALL ZLACPY( 'F', 1, N-1, A( 1, 2 ), LDA+1, WORK( 2*N ), 1) - CALL ZLACPY( 'F', 1, N-1, A( 1, 2 ), LDA+1, WORK( 1 ), 1) + CALL ZLACPY( 'F', 1, N-1, A( 1, 2 ), LDA+1, WORK( 1 ), 1 ) CALL ZLACGV( N-1, WORK( 1 ), 1 ) END IF - CALL ZGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB, - $ INFO) + CALL ZGTSV( N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB, + $ INFO ) +* +* 3) Backward substitution with U +* + IF( N.GT.1 ) THEN * -* Compute (U**T \ B) -> B [ U**T \ (T \ (U \P**T * B) ) ] +* Compute U \ B -> B [ U \ (T \ (U**H \P**T * B) ) ] * - CALL ZTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B(2, 1), LDB) + CALL ZTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B(2, 1), LDB) * -* Pivot, P * B [ P * (U**T \ (T \ (U \P**T * B) )) ] +* Pivot, P * B [ P * (U**H \ (T \ (U \P**T * B) )) ] * - DO K = N, 1, -1 - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO + DO K = N, 1, -1 + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO + END IF * ELSE * -* Solve A*X = B, where A = L*T*L**T. +* Solve A*X = B, where A = L*T*L**H. 
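The restructured lower-triangular branch that follows interleaves its three steps with the diff markers, so a contiguous restatement may help. The subroutine below is a compilable sketch, not the routine itself: the name solve_lower_aa is illustrative, A and IPIV are assumed to hold a ZHETRF_AA factorization A = L*T*L**H, WORK is assumed to have at least 3*N-2 entries, and ZSWAP, ZTRSM, ZLACPY, ZLACGV and ZGTSV come from a linked BLAS/LAPACK.

    subroutine solve_lower_aa(n, nrhs, a, lda, ipiv, b, ldb, work, info)
      implicit none
      integer :: n, nrhs, lda, ldb, info, k, kp, ipiv(*)
      complex(8) :: a(lda,*), b(ldb,*), work(*)
      complex(8), parameter :: one = (1.d0, 0.d0)

      ! 1) Forward step: apply P**T to B, then unit triangular solve with L
      if (n > 1) then
         do k = 1, n
            kp = ipiv(k)
            if (kp /= k) call zswap(nrhs, b(k,1), ldb, b(kp,1), ldb)
         end do
         call ztrsm('L', 'L', 'N', 'U', n-1, nrhs, one, a(2,1), lda,  &
                    b(2,1), ldb)
      end if

      ! 2) Tridiagonal step: copy T out of A (stride LDA+1 walks a
      !    diagonal) and solve T*X = B with ZGTSV; T is Hermitian, so the
      !    superdiagonal is the conjugate of the stored subdiagonal
      call zlacpy('F', 1, n, a(1,1), lda+1, work(n), 1)
      if (n > 1) then
         call zlacpy('F', 1, n-1, a(2,1), lda+1, work(1), 1)
         call zlacpy('F', 1, n-1, a(2,1), lda+1, work(2*n), 1)
         call zlacgv(n-1, work(2*n), 1)
      end if
      call zgtsv(n, nrhs, work(1), work(n), work(2*n), b, ldb, info)

      ! 3) Backward step: solve with L**H, then apply P
      if (n > 1) then
         call ztrsm('L', 'L', 'C', 'U', n-1, nrhs, one, a(2,1), lda,  &
                    b(2,1), ldb)
         do k = n, 1, -1
            kp = ipiv(k)
            if (kp /= k) call zswap(nrhs, b(k,1), ldb, b(kp,1), ldb)
         end do
      end if
    end subroutine solve_lower_aa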
+* +* 1) Forward substitution with L * -* Pivot, P**T * B + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B * - DO K = 1, N - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO + DO K = 1, N + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute L \ B -> B [ (L \P**T * B) ] * - CALL ZTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B(2, 1), LDB) + CALL ZTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B(2, 1), LDB) + END IF +* +* 2) Solve with triangular matrix T * * Compute T \ B -> B [ T \ (L \P**T * B) ] * @@ -266,18 +287,23 @@ CALL ZGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB, $ INFO) * -* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] +* 3) Backward substitution with L**H +* + IF( N.GT.1 ) THEN * - CALL ZTRSM( 'L', 'L', 'C', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) +* Compute L**H \ B -> B [ L**H \ (T \ (L \P**T * B) ) ] * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] + CALL ZTRSM( 'L', 'L', 'C', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB) * - DO K = N, 1, -1 - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO +* Pivot, P * B [ P * (L**H \ (T \ (L \P**T * B) )) ] +* + DO K = N, 1, -1 + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO + END IF * END IF * diff --git a/lapack-netlib/SRC/zhetrs_aa_2stage.f b/lapack-netlib/SRC/zhetrs_aa_2stage.f index 7fcee1118..c621bd571 100644 --- a/lapack-netlib/SRC/zhetrs_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrs_aa_2stage.f @@ -38,8 +38,8 @@ *> \verbatim *> *> ZHETRS_AA_2STAGE solves a system of linear equations A*X = B with a -*> hermitian matrix A using the factorization A = U*T*U**T or -*> A = L*T*L**T computed by ZHETRF_AA_2STAGE. +*> hermitian matrix A using the factorization A = U**H*T*U or +*> A = L*T*L**H computed by ZHETRF_AA_2STAGE. *> \endverbatim * * Arguments: @@ -50,8 +50,8 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; -*> = 'L': Lower triangular, form is A = L*T*L**T. +*> = 'U': Upper triangular, form is A = U**H*T*U; +*> = 'L': Lower triangular, form is A = L*T*L**H. *> \endverbatim *> *> \param[in] N @@ -210,33 +210,33 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**H*T*U. * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (U**T \P**T * B) -> B [ (U**T \P**T * B) ] +* Compute (U**H \ B) -> B [ (U**H \P**T * B) ] * CALL ZTRSM( 'L', 'U', 'C', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) * END IF * -* Compute T \ B -> B [ T \ (U**T \P**T * B) ] +* Compute T \ B -> B [ T \ (U**H \P**T * B) ] * CALL ZGBTRS( 'N', N, NB, NB, NRHS, TB, LDTB, IPIV2, B, LDB, $ INFO) IF( N.GT.NB ) THEN * -* Compute (U \ B) -> B [ U \ (T \ (U**T \P**T * B) ) ] +* Compute (U \ B) -> B [ U \ (T \ (U**H \P**T * B) ) ] * CALL ZTRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (U \ (T \ (U**T \P**T * B) )) ] +* Pivot, P * B -> B [ P * (U \ (T \ (U**H \P**T * B) )) ] * CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * @@ -244,15 +244,15 @@ * ELSE * -* Solve A*X = B, where A = L*T*L**T. +* Solve A*X = B, where A = L*T*L**H. 
* IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute (L \ B) -> B [ (L \P**T * B) ] * CALL ZTRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) @@ -265,12 +265,12 @@ $ INFO) IF( N.GT.NB ) THEN * -* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] +* Compute (L**H \ B) -> B [ L**H \ (T \ (L \P**T * B) ) ] * CALL ZTRSM( 'L', 'L', 'C', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* Pivot, P * B -> B [ P * (L**H \ (T \ (L \P**T * B) )) ] * CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * diff --git a/lapack-netlib/SRC/zhseqr.f b/lapack-netlib/SRC/zhseqr.f index 1e8134c39..2ee874dfd 100644 --- a/lapack-netlib/SRC/zhseqr.f +++ b/lapack-netlib/SRC/zhseqr.f @@ -69,7 +69,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -86,7 +86,7 @@ *> set by a previous call to ZGEBAL, and then passed to ZGEHRD *> when the matrix output by ZGEBAL is reduced to Hessenberg *> form. Otherwise ILO and IHI should be set to 1 and N -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -98,17 +98,17 @@ *> triangular matrix T from the Schur decomposition (the *> Schur form). If INFO = 0 and JOB = 'E', the contents of *> H are unspecified on exit. (The output value of H when -*> INFO.GT.0 is given under the description of INFO below.) +*> INFO > 0 is given under the description of INFO below.) *> *> Unlike earlier versions of ZHSEQR, this subroutine may -*> explicitly H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1 +*> explicitly H(i,j) = 0 for i > j and j = 1, 2, ... ILO-1 *> or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] W @@ -131,7 +131,7 @@ *> if INFO = 0, Z contains Q*Z. *> Normally Q is the unitary matrix generated by ZUNGHR *> after the call to ZGEHRD which formed the Hessenberg matrix -*> H. (The output value of Z when INFO.GT.0 is given under +*> H. (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -139,7 +139,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if COMPZ = 'I' or -*> COMPZ = 'V', then LDZ.GE.MAX(1,N). Otherwize, LDZ.GE.1. +*> COMPZ = 'V', then LDZ >= MAX(1,N). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -152,7 +152,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient and delivers very good and sometimes *> optimal performance. However, LWORK as large as 11*N *> may be required for optimal performance. A workspace @@ -170,21 +170,21 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .LT. 0: if INFO = -i, the i-th argument had an illegal +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal *> value -*> .GT. 0: if INFO = i, ZHSEQR failed to compute all of -*> the eigenvalues. 
Elements 1:ilo-1 and i+1:n of WR -*> and WI contain those eigenvalues which have been +*> > 0: if INFO = i, ZHSEQR failed to compute all of +*> the eigenvalues. Elements 1:ilo-1 and i+1:n of W +*> contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and JOB = 'E', then on exit, the +*> If INFO > 0 and JOB = 'E', then on exit, the *> remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and JOB = 'S', then on exit +*> If INFO > 0 and JOB = 'S', then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -192,19 +192,19 @@ *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and COMPZ = 'V', then on exit +*> If INFO > 0 and COMPZ = 'V', then on exit *> *> (final value of Z) = (initial value of Z)*U *> *> where U is the unitary matrix in (*) (regard- *> less of the value of JOB.) *> -*> If INFO .GT. 0 and COMPZ = 'I', then on exit +*> If INFO > 0 and COMPZ = 'I', then on exit *> (final value of Z) = U *> where U is the unitary matrix in (*) (regard- *> less of the value of JOB.) *> -*> If INFO .GT. 0 and COMPZ = 'N', then Z is not +*> If INFO > 0 and COMPZ = 'N', then Z is not *> accessed. *> \endverbatim * @@ -244,8 +244,8 @@ *> This depends on ILO, IHI and NS. NS is the *> number of simultaneous shifts returned *> by ILAENV(ISPEC=15). (See ISPEC=15 below.) -*> The default for (IHI-ILO+1).LE.500 is NS. -*> The default for (IHI-ILO+1).GT.500 is 3*NS/2. +*> The default for (IHI-ILO+1) <= 500 is NS. +*> The default for (IHI-ILO+1) > 500 is 3*NS/2. *> *> ISPEC=14: Nibble crossover point. (See IPARMQ for *> details.) Default: 14% of deflation window @@ -323,8 +323,8 @@ PARAMETER ( NTINY = 11 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare ZLAHQR failure. NL .GT. NTINY = 11 is -* . required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom- +* . through a rare ZLAHQR failure. NL > NTINY = 11 is +* . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 * . deflation window. ==== diff --git a/lapack-netlib/SRC/zla_gbrcond_c.f b/lapack-netlib/SRC/zla_gbrcond_c.f index 20109124b..5b2dc46fc 100644 --- a/lapack-netlib/SRC/zla_gbrcond_c.f +++ b/lapack-netlib/SRC/zla_gbrcond_c.f @@ -133,13 +133,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/zla_gbrcond_x.f b/lapack-netlib/SRC/zla_gbrcond_x.f index 7e6c12ea5..17e9eede7 100644 --- a/lapack-netlib/SRC/zla_gbrcond_x.f +++ b/lapack-netlib/SRC/zla_gbrcond_x.f @@ -126,13 +126,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. 
diff --git a/lapack-netlib/SRC/zla_gbrfsx_extended.f b/lapack-netlib/SRC/zla_gbrfsx_extended.f index 7a850f1aa..a22b5592e 100644 --- a/lapack-netlib/SRC/zla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/zla_gbrfsx_extended.f @@ -65,19 +65,19 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] TRANS_TYPE *> \verbatim *> TRANS_TYPE is INTEGER *> Specifies the transposition operation on A. -*> The value is defined by ILATRANS(T) where T is a CHARACTER and -*> T = 'N': No transpose +*> The value is defined by ILATRANS(T) where T is a CHARACTER and T +*> = 'N': No transpose *> = 'T': Transpose *> = 'C': Conjugate transpose *> \endverbatim @@ -269,7 +269,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/zla_gercond_c.f b/lapack-netlib/SRC/zla_gercond_c.f index e629f90e8..a1c0df588 100644 --- a/lapack-netlib/SRC/zla_gercond_c.f +++ b/lapack-netlib/SRC/zla_gercond_c.f @@ -22,7 +22,7 @@ * LDAF, IPIV, C, CAPPLY, * INFO, WORK, RWORK ) * -* .. Scalar Aguments .. +* .. Scalar Arguments .. * CHARACTER TRANS * LOGICAL CAPPLY * INTEGER N, LDA, LDAF, INFO @@ -114,13 +114,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. @@ -148,7 +148,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * -* .. Scalar Aguments .. +* .. Scalar Arguments .. CHARACTER TRANS LOGICAL CAPPLY INTEGER N, LDA, LDAF, INFO diff --git a/lapack-netlib/SRC/zla_gercond_x.f b/lapack-netlib/SRC/zla_gercond_x.f index 244bf58a3..3aa63ea84 100644 --- a/lapack-netlib/SRC/zla_gercond_x.f +++ b/lapack-netlib/SRC/zla_gercond_x.f @@ -107,13 +107,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/zla_gerfsx_extended.f b/lapack-netlib/SRC/zla_gerfsx_extended.f index 2e93e265e..e42ffa8e2 100644 --- a/lapack-netlib/SRC/zla_gerfsx_extended.f +++ b/lapack-netlib/SRC/zla_gerfsx_extended.f @@ -64,19 +64,19 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. 
-*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] TRANS_TYPE *> \verbatim *> TRANS_TYPE is INTEGER *> Specifies the transposition operation on A. -*> The value is defined by ILATRANS(T) where T is a CHARACTER and -*> T = 'N': No transpose +*> The value is defined by ILATRANS(T) where T is a CHARACTER and T +*> = 'N': No transpose *> = 'T': Transpose *> = 'C': Conjugate transpose *> \endverbatim @@ -256,7 +256,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERRS_C is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERRS_C is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERRS_C(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/zla_hercond_c.f b/lapack-netlib/SRC/zla_hercond_c.f index 61cfe95f1..7c933cc3c 100644 --- a/lapack-netlib/SRC/zla_hercond_c.f +++ b/lapack-netlib/SRC/zla_hercond_c.f @@ -111,13 +111,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/zla_hercond_x.f b/lapack-netlib/SRC/zla_hercond_x.f index 9c19b487d..ee283c0b5 100644 --- a/lapack-netlib/SRC/zla_hercond_x.f +++ b/lapack-netlib/SRC/zla_hercond_x.f @@ -104,13 +104,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/zla_herfsx_extended.f b/lapack-netlib/SRC/zla_herfsx_extended.f index 5b43a58b9..8329080ef 100644 --- a/lapack-netlib/SRC/zla_herfsx_extended.f +++ b/lapack-netlib/SRC/zla_herfsx_extended.f @@ -66,11 +66,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -254,7 +254,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/zla_herpvgrw.f b/lapack-netlib/SRC/zla_herpvgrw.f index 557d6e830..d414c371f 100644 --- a/lapack-netlib/SRC/zla_herpvgrw.f +++ b/lapack-netlib/SRC/zla_herpvgrw.f @@ -102,7 +102,7 @@ *> as determined by ZHETRF. 
*> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (2*N) *> \endverbatim diff --git a/lapack-netlib/SRC/zla_porcond_c.f b/lapack-netlib/SRC/zla_porcond_c.f index a74295b41..2e591dd09 100644 --- a/lapack-netlib/SRC/zla_porcond_c.f +++ b/lapack-netlib/SRC/zla_porcond_c.f @@ -103,13 +103,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/zla_porcond_x.f b/lapack-netlib/SRC/zla_porcond_x.f index 0b2c84f42..4f409544f 100644 --- a/lapack-netlib/SRC/zla_porcond_x.f +++ b/lapack-netlib/SRC/zla_porcond_x.f @@ -96,13 +96,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/zla_porfsx_extended.f b/lapack-netlib/SRC/zla_porfsx_extended.f index 85dd42780..169a9a5d4 100644 --- a/lapack-netlib/SRC/zla_porfsx_extended.f +++ b/lapack-netlib/SRC/zla_porfsx_extended.f @@ -65,11 +65,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -246,7 +246,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/zla_porpvgrw.f b/lapack-netlib/SRC/zla_porpvgrw.f index cd71635ec..f669b2864 100644 --- a/lapack-netlib/SRC/zla_porpvgrw.f +++ b/lapack-netlib/SRC/zla_porpvgrw.f @@ -86,7 +86,7 @@ *> The leading dimension of the array AF. LDAF >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (2*N) *> \endverbatim diff --git a/lapack-netlib/SRC/zla_syrcond_c.f b/lapack-netlib/SRC/zla_syrcond_c.f index be9d14bd0..ff44d6c3b 100644 --- a/lapack-netlib/SRC/zla_syrcond_c.f +++ b/lapack-netlib/SRC/zla_syrcond_c.f @@ -111,13 +111,13 @@ *> i > 0: The ith argument is invalid. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/zla_syrcond_x.f b/lapack-netlib/SRC/zla_syrcond_x.f index 2d0269092..53022bbfb 100644 --- a/lapack-netlib/SRC/zla_syrcond_x.f +++ b/lapack-netlib/SRC/zla_syrcond_x.f @@ -104,13 +104,13 @@ *> i > 0: The ith argument is invalid. 
*> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is COMPLEX*16 array, dimension (2*N). *> Workspace. *> \endverbatim *> -*> \param[in] RWORK +*> \param[out] RWORK *> \verbatim *> RWORK is DOUBLE PRECISION array, dimension (N). *> Workspace. diff --git a/lapack-netlib/SRC/zla_syrfsx_extended.f b/lapack-netlib/SRC/zla_syrfsx_extended.f index a9716fd23..69844c94b 100644 --- a/lapack-netlib/SRC/zla_syrfsx_extended.f +++ b/lapack-netlib/SRC/zla_syrfsx_extended.f @@ -66,11 +66,11 @@ *> \verbatim *> PREC_TYPE is INTEGER *> Specifies the intermediate precision to be used in refinement. -*> The value is defined by ILAPREC(P) where P is a CHARACTER and -*> P = 'S': Single +*> The value is defined by ILAPREC(P) where P is a CHARACTER and P +*> = 'S': Single *> = 'D': Double *> = 'I': Indigenous -*> = 'X', 'E': Extra +*> = 'X' or 'E': Extra *> \endverbatim *> *> \param[in] UPLO @@ -254,7 +254,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith diff --git a/lapack-netlib/SRC/zla_syrpvgrw.f b/lapack-netlib/SRC/zla_syrpvgrw.f index ccf4fc2d6..82c9f52f8 100644 --- a/lapack-netlib/SRC/zla_syrpvgrw.f +++ b/lapack-netlib/SRC/zla_syrpvgrw.f @@ -102,7 +102,7 @@ *> as determined by ZSYTRF. *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim *> WORK is DOUBLE PRECISION array, dimension (2*N) *> \endverbatim diff --git a/lapack-netlib/SRC/zla_wwaddw.f b/lapack-netlib/SRC/zla_wwaddw.f index b4f9df332..f06113a95 100644 --- a/lapack-netlib/SRC/zla_wwaddw.f +++ b/lapack-netlib/SRC/zla_wwaddw.f @@ -36,7 +36,7 @@ *> ZLA_WWADDW adds a vector W into a doubled-single vector (X, Y). *> *> This works for all extant IBM's hex and binary floating point -*> arithmetics, but not for decimal. +*> arithmetic, but not for decimal. 
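A doubled-single value keeps a number as the unevaluated sum of a high word X and a low word Y; adding W to it hinges on the fast two-sum transformation, which is exact in binary floating point whenever |x| >= |w|. The program below is a minimal self-contained sketch of that idea, not the body of ZLA_WWADDW itself:

    program demo_doubled_single
      implicit none
      double precision :: x, y, w, s, err

      ! Doubled-single value (x, y) = 1 + 2**-60, as high/low words
      x = 1.0d0
      y = 2.0d0**(-60)
      w = 2.0d0**(-61)        ! addend that would vanish in x alone

      ! Fast two-sum (Dekker): s + err == x + w exactly when |x| >= |w|
      s   = x + w
      err = (x - s) + w

      ! Fold the rounding error into the low-order word
      y = y + err
      x = s
      print *, 'high =', x, ' low =', y   ! low holds 2**-60 + 2**-61
    end program demo_doubled_single

The error term recovered by (x - s) + w is exactly the part of w that rounding discarded from s, so folding it into the low word loses nothing.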
*> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zlahef_aa.f b/lapack-netlib/SRC/zlahef_aa.f index 8bad4aba9..ddd1e9493 100644 --- a/lapack-netlib/SRC/zlahef_aa.f +++ b/lapack-netlib/SRC/zlahef_aa.f @@ -288,8 +288,9 @@ * * Swap A(I1, I2+1:N) with A(I2, I2+1:N) * - CALL ZSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, - $ A( J1+I2-1, I2+1 ), LDA ) + IF( I2.LT.M ) + $ CALL ZSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, + $ A( J1+I2-1, I2+1 ), LDA ) * * Swap A(I1, I1) with A(I2,I2) * @@ -329,13 +330,15 @@ * Compute L(J+2, J+1) = WORK( 3:N ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:N, J) = L(J+2:N, J+1) * - IF( A( K, J+1 ).NE.ZERO ) THEN - ALPHA = ONE / A( K, J+1 ) - CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) - CALL ZSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) - ELSE - CALL ZLASET( 'Full', 1, M-J-1, ZERO, ZERO, - $ A( K, J+2 ), LDA) + IF( J.LT.(M-1) ) THEN + IF( A( K, J+1 ).NE.ZERO ) THEN + ALPHA = ONE / A( K, J+1 ) + CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) + CALL ZSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) + ELSE + CALL ZLASET( 'Full', 1, M-J-1, ZERO, ZERO, + $ A( K, J+2 ), LDA) + END IF END IF END IF J = J + 1 @@ -440,8 +443,9 @@ * * Swap A(I2+1:N, I1) with A(I2+1:N, I2) * - CALL ZSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, - $ A( I2+1, J1+I2-1 ), 1 ) + IF( I2.LT.M ) + $ CALL ZSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, + $ A( I2+1, J1+I2-1 ), 1 ) * * Swap A(I1, I1) with A(I2, I2) * @@ -481,13 +485,15 @@ * Compute L(J+2, J+1) = WORK( 3:N ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:N, J) = L(J+2:N, J+1) * - IF( A( J+1, K ).NE.ZERO ) THEN - ALPHA = ONE / A( J+1, K ) - CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) - CALL ZSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) - ELSE - CALL ZLASET( 'Full', M-J-1, 1, ZERO, ZERO, - $ A( J+2, K ), LDA ) + IF( J.LT.(M-1) ) THEN + IF( A( J+1, K ).NE.ZERO ) THEN + ALPHA = ONE / A( J+1, K ) + CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) + CALL ZSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) + ELSE + CALL ZLASET( 'Full', M-J-1, 1, ZERO, ZERO, + $ A( J+2, K ), LDA ) + END IF END IF END IF J = J + 1 diff --git a/lapack-netlib/SRC/zlahef_rk.f b/lapack-netlib/SRC/zlahef_rk.f index d8d54f4ce..6a8549cf5 100644 --- a/lapack-netlib/SRC/zlahef_rk.f +++ b/lapack-netlib/SRC/zlahef_rk.f @@ -331,7 +331,7 @@ * Factorize the trailing columns of A using the upper triangle * of A and working backwards, and compute the matrix W = U12*D * for use in updating A11 (note that conjg(W) is actually stored) -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = CZERO @@ -789,7 +789,7 @@ * of A and working forwards, and compute the matrix W = L21*D * for use in updating A22 (note that conjg(W) is actually stored) * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = CZERO * diff --git a/lapack-netlib/SRC/zlahqr.f b/lapack-netlib/SRC/zlahqr.f index 19015b3fa..0a8318874 100644 --- a/lapack-netlib/SRC/zlahqr.f +++ b/lapack-netlib/SRC/zlahqr.f @@ -138,26 +138,26 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, ZLAHQR failed to compute all the +*> = 0: successful exit +*> > 0: if INFO = i, ZLAHQR failed to compute all the *> eigenvalues ILO to IHI in a total of 30 iterations *> per eigenvalue; elements i+1:ihi of W contain *> those eigenvalues which have been successfully *> computed. *> -*> If INFO .GT. 
0 and WANTT is .FALSE., then on exit, +*> If INFO > 0 and WANTT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the *> eigenvalues of the upper Hessenberg matrix -*> rows and columns ILO thorugh INFO of the final, +*> rows and columns ILO through INFO of the final, *> output value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> (*) (initial value of H)*U = U*(final value of H) -*> where U is an orthognal matrix. The final +*> where U is an orthogonal matrix. The final *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> (final value of Z) = (initial value of Z)*U *> where U is the orthogonal matrix in (*) *> (regardless of the value of WANTT.) diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f index 0e0b0a1da..f32f5667c 100644 --- a/lapack-netlib/SRC/zlamswlq.f +++ b/lapack-netlib/SRC/zlamswlq.f @@ -1,3 +1,4 @@ +*> \brief \b ZLAMSWLQ * * Definition: * =========== diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f index 1ee732425..034c45505 100644 --- a/lapack-netlib/SRC/zlamtsqr.f +++ b/lapack-netlib/SRC/zlamtsqr.f @@ -1,3 +1,4 @@ +*> \brief \b ZLAMTSQR * * Definition: * =========== diff --git a/lapack-netlib/SRC/zlangb.f b/lapack-netlib/SRC/zlangb.f index 949bb2c01..e40a470fd 100644 --- a/lapack-netlib/SRC/zlangb.f +++ b/lapack-netlib/SRC/zlangb.f @@ -130,6 +130,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER KL, KU, LDAB, N @@ -147,14 +148,17 @@ * .. * .. Local Scalars .. INTEGER I, J, K, L - DOUBLE PRECISION SCALE, SUM, VALUE, TEMP + DOUBLE PRECISION SUM, VALUE, TEMP +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -207,15 +211,22 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N L = MAX( 1, J-KU ) K = KU + 1 - J + L - CALL ZLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANGB = VALUE diff --git a/lapack-netlib/SRC/zlange.f b/lapack-netlib/SRC/zlange.f index 5407decef..8162786fb 100644 --- a/lapack-netlib/SRC/zlange.f +++ b/lapack-netlib/SRC/zlange.f @@ -120,6 +120,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER LDA, M, N @@ -137,14 +138,17 @@ * .. * .. Local Scalars .. INTEGER I, J - DOUBLE PRECISION SCALE, SUM, VALUE, TEMP + DOUBLE PRECISION SUM, VALUE, TEMP +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. 
Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT @@ -196,13 +200,19 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N - CALL ZLASSQ( M, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( M, A( 1, J ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANGE = VALUE diff --git a/lapack-netlib/SRC/zlanhb.f b/lapack-netlib/SRC/zlanhb.f index b3717804f..16b5c117c 100644 --- a/lapack-netlib/SRC/zlanhb.f +++ b/lapack-netlib/SRC/zlanhb.f @@ -137,6 +137,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER K, LDAB, N @@ -154,14 +155,17 @@ * .. * .. Local Scalars .. INTEGER I, J, L - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, SQRT @@ -233,39 +237,57 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( K.GT.0 ) THEN IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL ZLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE L = K + 1 ELSE DO 120 J = 1, N - 1 - CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE L = 1 END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) ELSE L = 1 END IF +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE DO 130 J = 1, N IF( DBLE( AB( L, J ) ).NE.ZERO ) THEN ABSA = ABS( DBLE( AB( L, J ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + CALL DCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANHB = VALUE diff --git a/lapack-netlib/SRC/zlanhe.f b/lapack-netlib/SRC/zlanhe.f index 7c7f7f3be..5aef9a756 100644 --- a/lapack-netlib/SRC/zlanhe.f +++ b/lapack-netlib/SRC/zlanhe.f @@ -129,6 +129,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER LDA, N @@ -146,14 +147,17 @@ * .. * .. Local Scalars .. INTEGER I, J - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. 
External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, SQRT @@ -223,31 +227,48 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL ZLASSQ( J-1, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( J-1, A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL ZLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( N-J, A( J+1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* DO 130 I = 1, N IF( DBLE( A( I, I ) ).NE.ZERO ) THEN ABSA = ABS( DBLE( A( I, I ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( SSQ( 1 ).LT.ABSA ) THEN + SSQ( 2 ) = ONE + SSQ( 2 )*( SSQ( 1 ) / ABSA )**2 + SSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + SSQ( 2 ) = SSQ( 2 ) + ( ABSA / SSQ( 1 ) )**2 END IF END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANHE = VALUE diff --git a/lapack-netlib/SRC/zlanhp.f b/lapack-netlib/SRC/zlanhp.f index 9ded60746..d795aeca9 100644 --- a/lapack-netlib/SRC/zlanhp.f +++ b/lapack-netlib/SRC/zlanhp.f @@ -122,6 +122,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER N @@ -139,14 +140,17 @@ * .. * .. Local Scalars .. INTEGER I, J, K - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, SQRT @@ -225,31 +229,48 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
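The column-separate strategy rests on the invariant that a pair (scl, ssq) represents scl**2 * ssq: ZLASSQ produces one such pair per column, and two pairs merge by rescaling the smaller-scale contribution. A minimal sketch of such a merge, written to the same contract the DCOMBSSQ calls in these hunks rely on (the name combssq below is illustrative):

    subroutine combssq(v1, v2)
      ! On exit: v1(1)**2 * v1(2) == old v1(1)**2*v1(2) + v2(1)**2*v2(2).
      ! v(1) is the scale, v(2) the scaled sum of squares; both nonnegative.
      implicit none
      double precision :: v1(2), v2(2)
      if (v1(1) >= v2(1)) then
         ! v1's scale dominates; fold v2 in, rescaled (skip if both zero)
         if (v1(1) /= 0.0d0) v1(2) = v1(2) + (v2(1)/v1(1))**2 * v2(2)
      else
         ! v2's scale dominates; rescale v1's sum and adopt v2's scale
         v1(2) = v2(2) + (v1(1)/v2(1))**2 * v1(2)
         v1(1) = v2(1)
      end if
    end subroutine combssq

Accumulating per column keeps each ZLASSQ call's rescaling local to that column instead of repeatedly rescaling one long-running global pair.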
+* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE K = 2 IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL ZLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + J 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL ZLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 120 CONTINUE END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* K = 1 + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE DO 130 I = 1, N IF( DBLE( AP( K ) ).NE.ZERO ) THEN ABSA = ABS( DBLE( AP( K ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF IF( LSAME( UPLO, 'U' ) ) THEN @@ -258,7 +279,8 @@ K = K + N - I + 1 END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + CALL DCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANHP = VALUE diff --git a/lapack-netlib/SRC/zlanhs.f b/lapack-netlib/SRC/zlanhs.f index f2d36b304..bd8e86be9 100644 --- a/lapack-netlib/SRC/zlanhs.f +++ b/lapack-netlib/SRC/zlanhs.f @@ -114,6 +114,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM INTEGER LDA, N @@ -131,14 +132,17 @@ * .. * .. Local Scalars .. INTEGER I, J - DOUBLE PRECISION SCALE, SUM, VALUE + DOUBLE PRECISION SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT @@ -190,13 +194,20 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 90 J = 1, N - CALL ZLASSQ( MIN( N, J+1 ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( MIN( N, J+1 ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 90 CONTINUE - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANHS = VALUE diff --git a/lapack-netlib/SRC/zlansb.f b/lapack-netlib/SRC/zlansb.f index 3468c49b3..245dcaf4b 100644 --- a/lapack-netlib/SRC/zlansb.f +++ b/lapack-netlib/SRC/zlansb.f @@ -135,6 +135,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER K, LDAB, N @@ -152,14 +153,17 @@ * .. * .. Local Scalars .. INTEGER I, J, L - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -227,29 +231,47 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). 
+* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. +* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( K.GT.0 ) THEN IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL ZLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE L = K + 1 ELSE DO 120 J = 1, N - 1 - CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE L = 1 END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) ELSE L = 1 END IF - CALL ZLASSQ( N, AB( L, 1 ), LDAB, SCALE, SUM ) - VALUE = SCALE*SQRT( SUM ) +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( N, AB( L, 1 ), LDAB, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANSB = VALUE diff --git a/lapack-netlib/SRC/zlansp.f b/lapack-netlib/SRC/zlansp.f index 84fb972bb..fa9220487 100644 --- a/lapack-netlib/SRC/zlansp.f +++ b/lapack-netlib/SRC/zlansp.f @@ -120,6 +120,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER N @@ -137,14 +138,17 @@ * .. * .. Local Scalars .. INTEGER I, J, K - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, SQRT @@ -219,40 +223,57 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
+* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE K = 2 IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL ZLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + J 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL ZLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 120 CONTINUE END IF - SUM = 2*SUM + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* K = 1 + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE DO 130 I = 1, N IF( DBLE( AP( K ) ).NE.ZERO ) THEN ABSA = ABS( DBLE( AP( K ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF IF( DIMAG( AP( K ) ).NE.ZERO ) THEN ABSA = ABS( DIMAG( AP( K ) ) ) - IF( SCALE.LT.ABSA ) THEN - SUM = ONE + SUM*( SCALE / ABSA )**2 - SCALE = ABSA + IF( COLSSQ( 1 ).LT.ABSA ) THEN + COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2 + COLSSQ( 1 ) = ABSA ELSE - SUM = SUM + ( ABSA / SCALE )**2 + COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2 END IF END IF IF( LSAME( UPLO, 'U' ) ) THEN @@ -261,7 +282,8 @@ K = K + N - I + 1 END IF 130 CONTINUE - VALUE = SCALE*SQRT( SUM ) + CALL DCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANSP = VALUE diff --git a/lapack-netlib/SRC/zlansy.f b/lapack-netlib/SRC/zlansy.f index 58269a911..e022f85e1 100644 --- a/lapack-netlib/SRC/zlansy.f +++ b/lapack-netlib/SRC/zlansy.f @@ -128,6 +128,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER NORM, UPLO INTEGER LDA, N @@ -145,14 +146,17 @@ * .. * .. Local Scalars .. INTEGER I, J - DOUBLE PRECISION ABSA, SCALE, SUM, VALUE + DOUBLE PRECISION ABSA, SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT @@ -218,21 +222,39 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
+* + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE +* +* Sum off-diagonals * - SCALE = ZERO - SUM = ONE IF( LSAME( UPLO, 'U' ) ) THEN DO 110 J = 2, N - CALL ZLASSQ( J-1, A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( J-1, A( 1, J ), 1, COLSSQ(1), COLSSQ(2) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 110 CONTINUE ELSE DO 120 J = 1, N - 1 - CALL ZLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( N-J, A( J+1, J ), 1, COLSSQ(1), COLSSQ(2) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 120 CONTINUE END IF - SUM = 2*SUM - CALL ZLASSQ( N, A, LDA+1, SCALE, SUM ) - VALUE = SCALE*SQRT( SUM ) + SSQ( 2 ) = 2*SSQ( 2 ) +* +* Sum diagonal +* + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( N, A, LDA+1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANSY = VALUE diff --git a/lapack-netlib/SRC/zlantb.f b/lapack-netlib/SRC/zlantb.f index 3077ba151..f02509223 100644 --- a/lapack-netlib/SRC/zlantb.f +++ b/lapack-netlib/SRC/zlantb.f @@ -146,6 +146,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER K, LDAB, N @@ -164,14 +165,17 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J, L - DOUBLE PRECISION SCALE, SUM, VALUE + DOUBLE PRECISION SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -313,46 +317,61 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N IF( K.GT.0 ) THEN DO 280 J = 2, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL ZLASSQ( MIN( J-1, K ), - $ AB( MAX( K+2-J, 1 ), J ), 1, SCALE, - $ SUM ) + $ AB( MAX( K+2-J, 1 ), J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 280 CONTINUE END IF ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 290 J = 1, N + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE CALL ZLASSQ( MIN( J, K+1 ), AB( MAX( K+2-J, 1 ), J ), - $ 1, SCALE, SUM ) + $ 1, COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 290 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N IF( K.GT.0 ) THEN DO 300 J = 1, N - 1 - CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 300 CONTINUE END IF ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 310 J = 1, N - CALL ZLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 310 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANTB = VALUE diff --git a/lapack-netlib/SRC/zlantp.f b/lapack-netlib/SRC/zlantp.f index 69dbaa5bc..d32a00f13 100644 --- a/lapack-netlib/SRC/zlantp.f +++ b/lapack-netlib/SRC/zlantp.f @@ -130,6 +130,7 @@ * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER N @@ -148,14 +149,17 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J, K - DOUBLE PRECISION SCALE, SUM, VALUE + DOUBLE PRECISION SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, SQRT @@ -308,45 +312,64 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. * IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N K = 2 DO 280 J = 2, N - CALL ZLASSQ( J-1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( J-1, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + J 280 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE K = 1 DO 290 J = 1, N - CALL ZLASSQ( J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( J, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + J 290 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = N + SSQ( 1 ) = ONE + SSQ( 2 ) = N K = 2 DO 300 J = 1, N - 1 - CALL ZLASSQ( N-J, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( N-J, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 300 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE K = 1 DO 310 J = 1, N - CALL ZLASSQ( N-J+1, AP( K ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( N-J+1, AP( K ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) K = K + N - J + 1 310 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANTP = VALUE diff --git a/lapack-netlib/SRC/zlantr.f b/lapack-netlib/SRC/zlantr.f index 04ee482f7..7d63c972e 100644 --- a/lapack-netlib/SRC/zlantr.f +++ b/lapack-netlib/SRC/zlantr.f @@ -147,6 +147,7 @@ * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * December 2016 * + IMPLICIT NONE * .. Scalar Arguments .. CHARACTER DIAG, NORM, UPLO INTEGER LDA, M, N @@ -165,14 +166,17 @@ * .. Local Scalars .. LOGICAL UDIAG INTEGER I, J - DOUBLE PRECISION SCALE, SUM, VALUE + DOUBLE PRECISION SUM, VALUE +* .. +* .. Local Arrays .. + DOUBLE PRECISION SSQ( 2 ), COLSSQ( 2 ) * .. * .. External Functions .. LOGICAL LSAME, DISNAN EXTERNAL LSAME, DISNAN * .. * .. External Subroutines .. - EXTERNAL ZLASSQ + EXTERNAL ZLASSQ, DCOMBSSQ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN, SQRT @@ -283,7 +287,7 @@ END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - DO 210 I = 1, N + DO 210 I = 1, MIN( M, N ) WORK( I ) = ONE 210 CONTINUE DO 220 I = N + 1, M @@ -313,38 +317,56 @@ ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN * * Find normF(A). +* SSQ(1) is scale +* SSQ(2) is sum-of-squares +* For better accuracy, sum each column separately. 
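Within one column, ZLASSQ maintains the pair with the classic rescaling recurrence; the zlansp hunk earlier inlines exactly this update, once for the real and once for the imaginary part of each diagonal entry. A sketch of a single step, again only an illustration (lassq_update is my name for it):

    def lassq_update(scale, sumsq, x):
        # Fold |x| into the pair (scale, sumsq) representing scale**2 * sumsq.
        ax = abs(x)
        if ax != 0.0:
            if scale < ax:
                sumsq = 1.0 + sumsq * (scale / ax) ** 2
                scale = ax
            else:
                sumsq = sumsq + (ax / scale) ** 2
        return scale, sumsq

    scale, sumsq = 0.0, 1.0
    for x in (3.0e-200, 4.0e-200):        # squaring these underflows to zero
        scale, sumsq = lassq_update(scale, sumsq, x)
    print(scale * sumsq ** 0.5)           # 5e-200, the exact norm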
* IF( LSAME( UPLO, 'U' ) ) THEN IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = MIN( M, N ) + SSQ( 1 ) = ONE + SSQ( 2 ) = MIN( M, N ) DO 290 J = 2, N - CALL ZLASSQ( MIN( M, J-1 ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( MIN( M, J-1 ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 290 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 300 J = 1, N - CALL ZLASSQ( MIN( M, J ), A( 1, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( MIN( M, J ), A( 1, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 300 CONTINUE END IF ELSE IF( LSAME( DIAG, 'U' ) ) THEN - SCALE = ONE - SUM = MIN( M, N ) + SSQ( 1 ) = ONE + SSQ( 2 ) = MIN( M, N ) DO 310 J = 1, N - CALL ZLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, SCALE, - $ SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 310 CONTINUE ELSE - SCALE = ZERO - SUM = ONE + SSQ( 1 ) = ZERO + SSQ( 2 ) = ONE DO 320 J = 1, N - CALL ZLASSQ( M-J+1, A( J, J ), 1, SCALE, SUM ) + COLSSQ( 1 ) = ZERO + COLSSQ( 2 ) = ONE + CALL ZLASSQ( M-J+1, A( J, J ), 1, + $ COLSSQ( 1 ), COLSSQ( 2 ) ) + CALL DCOMBSSQ( SSQ, COLSSQ ) 320 CONTINUE END IF END IF - VALUE = SCALE*SQRT( SUM ) + VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) ) END IF * ZLANTR = VALUE diff --git a/lapack-netlib/SRC/zlaqps.f b/lapack-netlib/SRC/zlaqps.f index c142e8c69..66c721517 100644 --- a/lapack-netlib/SRC/zlaqps.f +++ b/lapack-netlib/SRC/zlaqps.f @@ -127,7 +127,7 @@ *> \param[in,out] AUXV *> \verbatim *> AUXV is COMPLEX*16 array, dimension (NB) -*> Auxiliar vector. +*> Auxiliary vector. *> \endverbatim *> *> \param[in,out] F diff --git a/lapack-netlib/SRC/zlaqr0.f b/lapack-netlib/SRC/zlaqr0.f index 59b8ed7a6..feffe9782 100644 --- a/lapack-netlib/SRC/zlaqr0.f +++ b/lapack-netlib/SRC/zlaqr0.f @@ -66,7 +66,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -79,12 +79,12 @@ *> IHI is INTEGER *> *> It is assumed that H is already upper triangular in rows -*> and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1, +*> and columns 1:ILO-1 and IHI+1:N and, if ILO > 1, *> H(ILO,ILO-1) is zero. ILO and IHI are normally set by a *> previous call to ZGEBAL, and then passed to ZGEHRD when the *> matrix output by ZGEBAL is reduced to Hessenberg form. *> Otherwise, ILO and IHI should be set to 1 and N, -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -96,17 +96,17 @@ *> contains the upper triangular matrix T from the Schur *> decomposition (the Schur form). If INFO = 0 and WANT is *> .FALSE., then the contents of H are unspecified on exit. -*> (The output value of H when INFO.GT.0 is given under the +*> (The output value of H when INFO > 0 is given under the *> description of INFO below.) *> -*> This subroutine may explicitly set H(i,j) = 0 for i.GT.j and +*> This subroutine may explicitly set H(i,j) = 0 for i > j and *> j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). 
*> \endverbatim *> *> \param[out] W @@ -128,7 +128,7 @@ *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. -*> 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N. +*> 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -138,7 +138,7 @@ *> If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is *> replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the *> orthogonal Schur factor of H(ILO:IHI,ILO:IHI). -*> (The output value of Z when INFO.GT.0 is given under +*> (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -146,7 +146,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if WANTZ is .TRUE. -*> then LDZ.GE.MAX(1,IHIZ). Otherwize, LDZ.GE.1. +*> then LDZ >= MAX(1,IHIZ). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -159,7 +159,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient, but LWORK typically as large as 6*N may *> be required for optimal performance. A workspace query *> to determine the optimal workspace size is recommended. @@ -175,19 +175,19 @@ *> \param[out] INFO *> \verbatim *> INFO is INTEGER -*> = 0: successful exit -*> .GT. 0: if INFO = i, ZLAQR0 failed to compute all of +*> = 0: successful exit +*> > 0: if INFO = i, ZLAQR0 failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 0 and WANT is .FALSE., then on exit, +*> If INFO > 0 and WANT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -195,7 +195,7 @@ *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> *> (final value of Z(ILO:IHI,ILOZ:IHIZ) *> = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U @@ -203,7 +203,7 @@ *> where U is the unitary matrix in (*) (regard- *> less of the value of WANTT.) *> -*> If INFO .GT. 0 and WANTZ is .FALSE., then Z is not +*> If INFO > 0 and WANTZ is .FALSE., then Z is not *> accessed. *> \endverbatim * @@ -641,7 +641,7 @@ END IF END IF * -* ==== Use up to NS of the the smallest magnatiude +* ==== Use up to NS of the the smallest magnitude * . shifts. If there aren't NS shifts available, * . then use them all, possibly dropping one to * . make the number of shifts even. ==== diff --git a/lapack-netlib/SRC/zlaqr1.f b/lapack-netlib/SRC/zlaqr1.f index 34341cb10..fc2df3cb4 100644 --- a/lapack-netlib/SRC/zlaqr1.f +++ b/lapack-netlib/SRC/zlaqr1.f @@ -64,7 +64,7 @@ *> \verbatim *> LDH is INTEGER *> The leading dimension of H as declared in -*> the calling procedure. LDH.GE.N +*> the calling procedure. LDH >= N *> \endverbatim *> *> \param[in] S1 diff --git a/lapack-netlib/SRC/zlaqr2.f b/lapack-netlib/SRC/zlaqr2.f index e6e2ea48c..b5434e899 100644 --- a/lapack-netlib/SRC/zlaqr2.f +++ b/lapack-netlib/SRC/zlaqr2.f @@ -103,7 +103,7 @@ *> \param[in] NW *> \verbatim *> NW is INTEGER -*> Deflation window size. 1 .LE. NW .LE. 
(KBOT-KTOP+1). +*> Deflation window size. 1 <= NW <= (KBOT-KTOP+1). *> \endverbatim *> *> \param[in,out] H @@ -121,7 +121,7 @@ *> \verbatim *> LDH is INTEGER *> Leading dimension of H just as declared in the calling -*> subroutine. N .LE. LDH +*> subroutine. N <= LDH *> \endverbatim *> *> \param[in] ILOZ @@ -133,7 +133,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -149,7 +149,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of Z just as declared in the -*> calling subroutine. 1 .LE. LDZ. +*> calling subroutine. 1 <= LDZ. *> \endverbatim *> *> \param[out] NS @@ -186,13 +186,13 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of V just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is INTEGER -*> The number of columns of T. NH.GE.NW. +*> The number of columns of T. NH >= NW. *> \endverbatim *> *> \param[out] T @@ -204,14 +204,14 @@ *> \verbatim *> LDT is INTEGER *> The leading dimension of T just as declared in the -*> calling subroutine. NW .LE. LDT +*> calling subroutine. NW <= LDT *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> The number of rows of work array WV available for -*> workspace. NV.GE.NW. +*> workspace. NV >= NW. *> \endverbatim *> *> \param[out] WV @@ -223,7 +223,7 @@ *> \verbatim *> LDWV is INTEGER *> The leading dimension of W just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/zlaqr3.f b/lapack-netlib/SRC/zlaqr3.f index 64ab59f31..dfb798ca9 100644 --- a/lapack-netlib/SRC/zlaqr3.f +++ b/lapack-netlib/SRC/zlaqr3.f @@ -100,7 +100,7 @@ *> \param[in] NW *> \verbatim *> NW is INTEGER -*> Deflation window size. 1 .LE. NW .LE. (KBOT-KTOP+1). +*> Deflation window size. 1 <= NW <= (KBOT-KTOP+1). *> \endverbatim *> *> \param[in,out] H @@ -118,7 +118,7 @@ *> \verbatim *> LDH is INTEGER *> Leading dimension of H just as declared in the calling -*> subroutine. N .LE. LDH +*> subroutine. N <= LDH *> \endverbatim *> *> \param[in] ILOZ @@ -130,7 +130,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N. +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -146,7 +146,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of Z just as declared in the -*> calling subroutine. 1 .LE. LDZ. +*> calling subroutine. 1 <= LDZ. *> \endverbatim *> *> \param[out] NS @@ -183,13 +183,13 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of V just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is INTEGER -*> The number of columns of T. NH.GE.NW. +*> The number of columns of T. NH >= NW. *> \endverbatim *> *> \param[out] T @@ -201,14 +201,14 @@ *> \verbatim *> LDT is INTEGER *> The leading dimension of T just as declared in the -*> calling subroutine. NW .LE. LDT +*> calling subroutine. NW <= LDT *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> The number of rows of work array WV available for -*> workspace. NV.GE.NW. +*> workspace. NV >= NW. 
*> \endverbatim *> *> \param[out] WV @@ -220,7 +220,7 @@ *> \verbatim *> LDWV is INTEGER *> The leading dimension of W just as declared in the -*> calling subroutine. NW .LE. LDV +*> calling subroutine. NW <= LDV *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/zlaqr4.f b/lapack-netlib/SRC/zlaqr4.f index 012fa37e2..a88f6508e 100644 --- a/lapack-netlib/SRC/zlaqr4.f +++ b/lapack-netlib/SRC/zlaqr4.f @@ -73,7 +73,7 @@ *> \param[in] N *> \verbatim *> N is INTEGER -*> The order of the matrix H. N .GE. 0. +*> The order of the matrix H. N >= 0. *> \endverbatim *> *> \param[in] ILO @@ -85,12 +85,12 @@ *> \verbatim *> IHI is INTEGER *> It is assumed that H is already upper triangular in rows -*> and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1, +*> and columns 1:ILO-1 and IHI+1:N and, if ILO > 1, *> H(ILO,ILO-1) is zero. ILO and IHI are normally set by a *> previous call to ZGEBAL, and then passed to ZGEHRD when the *> matrix output by ZGEBAL is reduced to Hessenberg form. *> Otherwise, ILO and IHI should be set to 1 and N, -*> respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N. +*> respectively. If N > 0, then 1 <= ILO <= IHI <= N. *> If N = 0, then ILO = 1 and IHI = 0. *> \endverbatim *> @@ -102,17 +102,17 @@ *> contains the upper triangular matrix T from the Schur *> decomposition (the Schur form). If INFO = 0 and WANT is *> .FALSE., then the contents of H are unspecified on exit. -*> (The output value of H when INFO.GT.0 is given under the +*> (The output value of H when INFO > 0 is given under the *> description of INFO below.) *> -*> This subroutine may explicitly set H(i,j) = 0 for i.GT.j and +*> This subroutine may explicitly set H(i,j) = 0 for i > j and *> j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N. *> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is INTEGER -*> The leading dimension of the array H. LDH .GE. max(1,N). +*> The leading dimension of the array H. LDH >= max(1,N). *> \endverbatim *> *> \param[out] W @@ -134,7 +134,7 @@ *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. -*> 1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N. +*> 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. *> \endverbatim *> *> \param[in,out] Z @@ -144,7 +144,7 @@ *> If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is *> replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the *> orthogonal Schur factor of H(ILO:IHI,ILO:IHI). -*> (The output value of Z when INFO.GT.0 is given under +*> (The output value of Z when INFO > 0 is given under *> the description of INFO below.) *> \endverbatim *> @@ -152,7 +152,7 @@ *> \verbatim *> LDZ is INTEGER *> The leading dimension of the array Z. if WANTZ is .TRUE. -*> then LDZ.GE.MAX(1,IHIZ). Otherwize, LDZ.GE.1. +*> then LDZ >= MAX(1,IHIZ). Otherwise, LDZ >= 1. *> \endverbatim *> *> \param[out] WORK @@ -165,7 +165,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK .GE. max(1,N) +*> The dimension of the array WORK. LWORK >= max(1,N) *> is sufficient, but LWORK typically as large as 6*N may *> be required for optimal performance. A workspace query *> to determine the optimal workspace size is recommended. @@ -182,18 +182,18 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> .GT. 0: if INFO = i, ZLAQR4 failed to compute all of +*> > 0: if INFO = i, ZLAQR4 failed to compute all of *> the eigenvalues. Elements 1:ilo-1 and i+1:n of WR *> and WI contain those eigenvalues which have been *> successfully computed. (Failures are rare.) *> -*> If INFO .GT. 
0 and WANT is .FALSE., then on exit, +*> If INFO > 0 and WANT is .FALSE., then on exit, *> the remaining unconverged eigenvalues are the eigen- *> values of the upper Hessenberg matrix rows and *> columns ILO through INFO of the final, output *> value of H. *> -*> If INFO .GT. 0 and WANTT is .TRUE., then on exit +*> If INFO > 0 and WANTT is .TRUE., then on exit *> *> (*) (initial value of H)*U = U*(final value of H) *> @@ -201,7 +201,7 @@ *> value of H is upper Hessenberg and triangular in *> rows and columns INFO+1 through IHI. *> -*> If INFO .GT. 0 and WANTZ is .TRUE., then on exit +*> If INFO > 0 and WANTZ is .TRUE., then on exit *> *> (final value of Z(ILO:IHI,ILOZ:IHIZ) *> = (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U @@ -209,7 +209,7 @@ *> where U is the unitary matrix in (*) (regard- *> less of the value of WANTT.) *> -*> If INFO .GT. 0 and WANTZ is .FALSE., then Z is not +*> If INFO > 0 and WANTZ is .FALSE., then Z is not *> accessed. *> \endverbatim * @@ -641,7 +641,7 @@ END IF END IF * -* ==== Use up to NS of the the smallest magnatiude +* ==== Use up to NS of the the smallest magnitude * . shifts. If there aren't NS shifts available, * . then use them all, possibly dropping one to * . make the number of shifts even. ==== diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 0dfbce82c..9ff7e7eca 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -125,7 +125,7 @@ *> \verbatim *> LDH is INTEGER *> LDH is the leading dimension of H just as declared in the -*> calling procedure. LDH.GE.MAX(1,N). +*> calling procedure. LDH >= MAX(1,N). *> \endverbatim *> *> \param[in] ILOZ @@ -137,7 +137,7 @@ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be -*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N +*> applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N *> \endverbatim *> *> \param[in,out] Z @@ -153,7 +153,7 @@ *> \verbatim *> LDZ is INTEGER *> LDA is the leading dimension of Z just as declared in -*> the calling procedure. LDZ.GE.N. +*> the calling procedure. LDZ >= N. *> \endverbatim *> *> \param[out] V @@ -165,7 +165,7 @@ *> \verbatim *> LDV is INTEGER *> LDV is the leading dimension of V as declared in the -*> calling procedure. LDV.GE.3. +*> calling procedure. LDV >= 3. *> \endverbatim *> *> \param[out] U @@ -177,33 +177,14 @@ *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU.GE.3*NSHFTS-3. -*> \endverbatim -*> -*> \param[in] NH -*> \verbatim -*> NH is INTEGER -*> NH is the number of columns in array WH available for -*> workspace. NH.GE.1. -*> \endverbatim -*> -*> \param[out] WH -*> \verbatim -*> WH is COMPLEX*16 array, dimension (LDWH,NH) -*> \endverbatim -*> -*> \param[in] LDWH -*> \verbatim -*> LDWH is INTEGER -*> Leading dimension of WH just as declared in the -*> calling procedure. LDWH.GE.3*NSHFTS-3. +*> in the calling subroutine. LDU >= 3*NSHFTS-3. *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is INTEGER *> NV is the number of rows in WV agailable for workspace. -*> NV.GE.1. +*> NV >= 1. *> \endverbatim *> *> \param[out] WV @@ -215,9 +196,28 @@ *> \verbatim *> LDWV is INTEGER *> LDWV is the leading dimension of WV as declared in the -*> in the calling subroutine. LDWV.GE.NV. +*> in the calling subroutine. LDWV >= NV. *> \endverbatim * +*> \param[in] NH +*> \verbatim +*> NH is INTEGER +*> NH is the number of columns in array WH available for +*> workspace. NH >= 1. 
+*> \endverbatim +*> +*> \param[out] WH +*> \verbatim +*> WH is COMPLEX*16 array, dimension (LDWH,NH) +*> \endverbatim +*> +*> \param[in] LDWH +*> \verbatim +*> LDWH is INTEGER +*> Leading dimension of WH just as declared in the +*> calling procedure. LDWH >= 3*NSHFTS-3. +*> \endverbatim +*> * Authors: * ======== * diff --git a/lapack-netlib/SRC/zlarfb.f b/lapack-netlib/SRC/zlarfb.f index b4a2b4d1a..3da49f2fc 100644 --- a/lapack-netlib/SRC/zlarfb.f +++ b/lapack-netlib/SRC/zlarfb.f @@ -92,6 +92,8 @@ *> K is INTEGER *> The order of the matrix T (= the number of elementary *> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. *> \endverbatim *> *> \param[in] V diff --git a/lapack-netlib/SRC/zlarfx.f b/lapack-netlib/SRC/zlarfx.f index 685d164eb..ba6d4ed74 100644 --- a/lapack-netlib/SRC/zlarfx.f +++ b/lapack-netlib/SRC/zlarfx.f @@ -94,7 +94,7 @@ *> \param[in] LDC *> \verbatim *> LDC is INTEGER -*> The leading dimension of the array C. LDA >= max(1,M). +*> The leading dimension of the array C. LDC >= max(1,M). *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/zlarfy.f b/lapack-netlib/SRC/zlarfy.f index 57605731b..4c9e08bac 100644 --- a/lapack-netlib/SRC/zlarfy.f +++ b/lapack-netlib/SRC/zlarfy.f @@ -103,7 +103,7 @@ * *> \date December 2016 * -*> \ingroup complex16_eig +*> \ingroup complex16OTHERauxiliary * * ===================================================================== SUBROUTINE ZLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK ) diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f index 67a67584c..23976dbef 100644 --- a/lapack-netlib/SRC/zlarrv.f +++ b/lapack-netlib/SRC/zlarrv.f @@ -143,7 +143,7 @@ *> RTOL2 is DOUBLE PRECISION *> Parameters for bisection. *> An interval [LEFT,RIGHT] has converged if -*> RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) +*> RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) ) *> \endverbatim *> *> \param[in,out] W diff --git a/lapack-netlib/SRC/zlassq.f b/lapack-netlib/SRC/zlassq.f index fd13811bd..dccec988d 100644 --- a/lapack-netlib/SRC/zlassq.f +++ b/lapack-netlib/SRC/zlassq.f @@ -41,7 +41,7 @@ *> where x( i ) = abs( X( 1 + ( i - 1 )*INCX ) ). The value of sumsq is *> assumed to be at least unity and the value of ssq will then satisfy *> -*> 1.0 .le. ssq .le. ( sumsq + 2*n ). +*> 1.0 <= ssq <= ( sumsq + 2*n ). *> *> scale is assumed to be non-negative and scl returns the value *> @@ -65,7 +65,7 @@ *> *> \param[in] X *> \verbatim -*> X is COMPLEX*16 array, dimension (N) +*> X is COMPLEX*16 array, dimension (1+(N-1)*INCX) *> The vector x as described above. *> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. *> \endverbatim diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index 24dd41d79..990630925 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -1,3 +1,4 @@ +*> \brief \b ZLASWLQ * * Definition: * =========== @@ -18,9 +19,20 @@ *> *> \verbatim *> -*> ZLASWLQ computes a blocked Short-Wide LQ factorization of a -*> M-by-N matrix A, where N >= M: -*> A = L * Q +*> ZLASWLQ computes a blocked Short-Wide LQ factorization of +*> a complex M-by-N matrix A for M <= N: +*> +*> A = ( L 0 ) * Q, +*> +*> where: +*> +*> Q is an N-by-N orthogonal matrix, stored on exit in an implicit +*> form in the elements above the diagonal of the array A and in +*> the elements of the array T; +*> L is a lower-triangular M-by-M matrix stored on exit in +*> the elements on and below the diagonal of the array A.
+*> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored. +*> *> \endverbatim * * Arguments: @@ -150,7 +162,7 @@ SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, $ INFO) * -* -- LAPACK computational routine (version 3.7.1) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * June 2017 diff --git a/lapack-netlib/SRC/zlasyf_aa.f b/lapack-netlib/SRC/zlasyf_aa.f index f321b72de..b1f1c2790 100644 --- a/lapack-netlib/SRC/zlasyf_aa.f +++ b/lapack-netlib/SRC/zlasyf_aa.f @@ -284,8 +284,9 @@ * * Swap A(I1, I2+1:M) with A(I2, I2+1:M) * - CALL ZSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, - $ A( J1+I2-1, I2+1 ), LDA ) + IF( I2.LT.M ) + $ CALL ZSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA, + $ A( J1+I2-1, I2+1 ), LDA ) * * Swap A(I1, I1) with A(I2,I2) * @@ -325,13 +326,15 @@ * Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1) * - IF( A( K, J+1 ).NE.ZERO ) THEN - ALPHA = ONE / A( K, J+1 ) - CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) - CALL ZSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) - ELSE - CALL ZLASET( 'Full', 1, M-J-1, ZERO, ZERO, - $ A( K, J+2 ), LDA) + IF( J.LT.(M-1) ) THEN + IF( A( K, J+1 ).NE.ZERO ) THEN + ALPHA = ONE / A( K, J+1 ) + CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA ) + CALL ZSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA ) + ELSE + CALL ZLASET( 'Full', 1, M-J-1, ZERO, ZERO, + $ A( K, J+2 ), LDA) + END IF END IF END IF J = J + 1 @@ -432,8 +435,9 @@ * * Swap A(I2+1:M, I1) with A(I2+1:M, I2) * - CALL ZSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, - $ A( I2+1, J1+I2-1 ), 1 ) + IF( I2.LT.M ) + $ CALL ZSWAP( M-I2, A( I2+1, J1+I1-1 ), 1, + $ A( I2+1, J1+I2-1 ), 1 ) * * Swap A(I1, I1) with A(I2, I2) * @@ -473,13 +477,15 @@ * Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1), * where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1) * - IF( A( J+1, K ).NE.ZERO ) THEN - ALPHA = ONE / A( J+1, K ) - CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) - CALL ZSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) - ELSE - CALL ZLASET( 'Full', M-J-1, 1, ZERO, ZERO, - $ A( J+2, K ), LDA ) + IF( J.LT.(M-1) ) THEN + IF( A( J+1, K ).NE.ZERO ) THEN + ALPHA = ONE / A( J+1, K ) + CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 ) + CALL ZSCAL( M-J-1, ALPHA, A( J+2, K ), 1 ) + ELSE + CALL ZLASET( 'Full', M-J-1, 1, ZERO, ZERO, + $ A( J+2, K ), LDA ) + END IF END IF END IF J = J + 1 diff --git a/lapack-netlib/SRC/zlasyf_rk.f b/lapack-netlib/SRC/zlasyf_rk.f index 664ed93f3..b6c5a27c6 100644 --- a/lapack-netlib/SRC/zlasyf_rk.f +++ b/lapack-netlib/SRC/zlasyf_rk.f @@ -330,7 +330,7 @@ * of A and working backwards, and compute the matrix W = U12*D * for use in updating A11 * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = CZERO @@ -658,7 +658,7 @@ * of A and working forwards, and compute the matrix W = L21*D * for use in updating A22 * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = CZERO * diff --git a/lapack-netlib/SRC/zlatdf.f b/lapack-netlib/SRC/zlatdf.f index ab88570c5..4b8b5e330 100644 --- a/lapack-netlib/SRC/zlatdf.f +++ b/lapack-netlib/SRC/zlatdf.f @@ -261,7 +261,7 @@ * * Solve for U- part, lockahead for RHS(N) = +-1. 
This is not done * In BSOLVE and will hopefully give us a better estimate because -* any ill-conditioning of the original matrix is transfered to U +* any ill-conditioning of the original matrix is transferred to U * and not to L. U(N, N) is an approximation to sigma_min(LU). * CALL ZCOPY( N-1, RHS, 1, WORK, 1 ) diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index 1fdf3be24..0f98cae93 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -1,3 +1,4 @@ +*> \brief \b ZLATSQR * * Definition: * =========== @@ -18,9 +19,23 @@ *> *> \verbatim *> -*> SLATSQR computes a blocked Tall-Skinny QR factorization of -*> an M-by-N matrix A, where M >= N: -*> A = Q * R . +*> ZLATSQR computes a blocked Tall-Skinny QR factorization of +*> a complex M-by-N matrix A for M >= N: +*> +*> A = Q * ( R ), +*> ( 0 ) +*> +*> where: +*> +*> Q is an M-by-M orthogonal matrix, stored on exit in an implicit +*> form in the elements below the diagonal of the array A and in +*> the elements of the array T; +*> +*> R is an upper-triangular N-by-N matrix, stored on exit in +*> the elements on and above the diagonal of the array A. +*> +*> 0 is a (M-N)-by-N zero matrix, and is not stored. +*> +*> \endverbatim * * Arguments: * ========== @@ -149,10 +164,10 @@ SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, $ LWORK, INFO) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- -* December 2016 +* November 2019 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK diff --git a/lapack-netlib/SRC/zlaunhr_col_getrfnp.f b/lapack-netlib/SRC/zlaunhr_col_getrfnp.f new file mode 100644 index 000000000..0ab7f0349 --- /dev/null +++ b/lapack-netlib/SRC/zlaunhr_col_getrfnp.f @@ -0,0 +1,248 @@ +*> \brief \b ZLAUNHR_COL_GETRFNP +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAUNHR_COL_GETRFNP + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAUNHR_COL_GETRFNP( M, N, A, LDA, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), D( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLAUNHR_COL_GETRFNP computes the modified LU factorization without +*> pivoting of a complex general M-by-N matrix A. The factorization has +*> the form: +*> +*> A - S = L * U, +*> +*> where: +*> S is a m-by-n diagonal sign matrix with the diagonal D, so that +*> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed +*> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing +*> i-1 steps of Gaussian elimination. This means that the diagonal +*> element at each step of "modified" Gaussian elimination is +*> at least one in absolute value (so that division-by-zero is +*> not possible during the division by the diagonal element); +*> +*> L is a M-by-N lower triangular matrix with unit diagonal elements +*> (lower trapezoidal if M > N); +*> +*> and U is a M-by-N upper triangular matrix +*> (upper trapezoidal if M < N). +*> +*> This routine is an auxiliary routine used in the Householder +*> reconstruction routine ZUNHR_COL.
In ZUNHR_COL, this routine is +*> applied to an M-by-N matrix A with orthonormal columns, where each +*> element is bounded by one in absolute value. With the choice of +*> the matrix S above, one can show that the diagonal element at each +*> step of Gaussian elimination is the largest (in absolute value) in +*> the column on or below the diagonal, so that no pivoting is required +*> for numerical stability [1]. +*> +*> For more details on the Householder reconstruction algorithm, +*> including the modified LU factorization, see [1]. +*> +*> This is the blocked right-looking version of the algorithm, +*> calling Level 3 BLAS to update the submatrix. To factorize a block, +*> this routine calls the recursive routine ZLAUNHR_COL_GETRFNP2. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the M-by-N matrix to be factored. +*> On exit, the factors L and U from the factorization +*> A-S=L*U; the unit diagonal elements of L are not stored. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is COMPLEX*16 array, dimension min(M,N) +*> The diagonal elements of the diagonal M-by-N sign matrix S, +*> D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can be +*> only ( +1.0, 0.0 ) or (-1.0, 0.0 ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complex16GEcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZLAUNHR_COL_GETRFNP( M, N, A, LDA, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), D( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZLAUNHR_COL_GETRFNP2, ZTRSM, XERBLA +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. 
+* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLAUNHR_COL_GETRFNP', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + + NB = ILAENV( 1, 'ZLAUNHR_COL_GETRFNP', ' ', M, N, -1, -1 ) + + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL ZLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) + ELSE +* +* Use blocked code. +* + DO J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks. +* + CALL ZLAUNHR_COL_GETRFNP2( M-J+1, JB, A( J, J ), LDA, + $ D( J ), IINFO ) +* + IF( J+JB.LE.N ) THEN +* +* Compute block row of U. +* + CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, CONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL ZGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -CONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, CONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + END DO + END IF + RETURN +* +* End of ZLAUNHR_COL_GETRFNP +* + END diff --git a/lapack-netlib/SRC/zlaunhr_col_getrfnp2.f b/lapack-netlib/SRC/zlaunhr_col_getrfnp2.f new file mode 100644 index 000000000..0057e430d --- /dev/null +++ b/lapack-netlib/SRC/zlaunhr_col_getrfnp2.f @@ -0,0 +1,314 @@ +*> \brief \b ZLAUNHR_COL_GETRFNP2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAUNHR_COL_GETRFNP2 + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE ZLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), D( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLAUNHR_COL_GETRFNP2 computes the modified LU factorization without +*> pivoting of a complex general M-by-N matrix A. The factorization has +*> the form: +*> +*> A - S = L * U, +*> +*> where: +*> S is a m-by-n diagonal sign matrix with the diagonal D, so that +*> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed +*> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing +*> i-1 steps of Gaussian elimination. This means that the diagonal +*> element at each step of "modified" Gaussian elimination is at +*> least one in absolute value (so that division-by-zero not +*> possible during the division by the diagonal element); +*> +*> L is a M-by-N lower triangular matrix with unit diagonal elements +*> (lower trapezoidal if M > N); +*> +*> and U is a M-by-N upper triangular matrix +*> (upper trapezoidal if M < N). +*> +*> This routine is an auxiliary routine used in the Householder +*> reconstruction routine ZUNHR_COL. In ZUNHR_COL, this routine is +*> applied to an M-by-N matrix A with orthonormal columns, where each +*> element is bounded by one in absolute value. With the choice of +*> the matrix S above, one can show that the diagonal element at each +*> step of Gaussian elimination is the largest (in absolute value) in +*> the column on or below the diagonal, so that no pivoting is required +*> for numerical stability [1]. 
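The construction documented above is short enough to state directly. A hedged, unblocked Python sketch of the modified LU follows; getrfnp_sketch is my name for it, and the real implementations are the two Fortran routines added in this patch.

    import numpy as np

    def getrfnp_sketch(A0):
        # At step i, subtract the sign shift D[i] = -sign(Re(A[i,i])) from
        # the current pivot; the divisor then has absolute value >= 1, so
        # no pivoting is needed when A0 has orthonormal columns.
        A = np.array(A0, dtype=complex)
        m, n = A.shape
        d = np.empty(min(m, n), dtype=complex)
        for i in range(len(d)):
            d[i] = -1.0 if A[i, i].real >= 0.0 else 1.0
            A[i, i] = A[i, i] - d[i]                 # pivot of U
            if i + 1 < m:
                A[i + 1:, i] /= A[i, i]              # multipliers, column of L
                A[i + 1:, i + 1:] -= np.outer(A[i + 1:, i], A[i, i + 1:])
        return A, d                                  # L\U packed, diagonal of S

    Q = np.linalg.qr(np.random.randn(6, 4))[0]       # orthonormal columns
    LU, d = getrfnp_sketch(Q)
    L = np.tril(LU, -1) + np.eye(6, 4)               # unit lower trapezoidal
    U = np.triu(LU)[:4, :]
    S = np.zeros((6, 4), dtype=complex)
    S[np.arange(4), np.arange(4)] = d
    assert np.allclose(L @ U, Q - S)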
+*> +*> For more details on the Householder reconstruction algorithm, +*> including the modified LU factorization, see [1]. +*> +*> This is the recursive version of the LU factorization algorithm. +*> Denote A - S by B. The algorithm divides the matrix B into four +*> submatrices: +*> +*> [ B11 | B12 ] where B11 is n1 by n1, +*> B = [ -----|----- ] B21 is (m-n1) by n1, +*> [ B21 | B22 ] B12 is n1 by n2, +*> B22 is (m-n1) by n2, +*> with n1 = min(m,n)/2, n2 = n-n1. +*> +*> +*> The subroutine calls itself to factor B11, solves for B21, +*> solves for B12, updates B22, then calls itself to factor B22. +*> +*> For more details on the recursive LU algorithm, see [2]. +*> +*> ZLAUNHR_COL_GETRFNP2 is called to factorize a block by the blocked +*> routine ZLAUNHR_COL_GETRFNP, which uses blocked code calling +*. Level 3 BLAS to update the submatrix. However, ZLAUNHR_COL_GETRFNP2 +*> is self-sufficient and can be used without ZLAUNHR_COL_GETRFNP. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> +*> [2] "Recursion leads to automatic variable blocking for dense linear +*> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev., +*> vol. 41, no. 6, pp. 737-755, 1997. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the M-by-N matrix to be factored. +*> On exit, the factors L and U from the factorization +*> A-S=L*U; the unit diagonal elements of L are not stored. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is COMPLEX*16 array, dimension min(M,N) +*> The diagonal elements of the diagonal M-by-N sign matrix S, +*> D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can be +*> only ( +1.0, 0.0 ) or (-1.0, 0.0 ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complex16GEcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + RECURSIVE SUBROUTINE ZLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), D( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
+ DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION SFMIN + INTEGER I, IINFO, N1, N2 + COMPLEX*16 Z +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZSCAL, ZTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCMPLX, DIMAG, DSIGN, MAX, MIN +* .. +* .. Statement Functions .. + DOUBLE PRECISION CABS1 +* .. +* .. Statement Function definitions .. + CABS1( Z ) = ABS( DBLE( Z ) ) + ABS( DIMAG( Z ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLAUNHR_COL_GETRFNP2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) + $ RETURN + + IF ( M.EQ.1 ) THEN +* +* One row case, (also recursion termination case), +* use unblocked code +* +* Transfer the sign +* + D( 1 ) = DCMPLX( -DSIGN( ONE, DBLE( A( 1, 1 ) ) ) ) +* +* Construct the row of U +* + A( 1, 1 ) = A( 1, 1 ) - D( 1 ) +* + ELSE IF( N.EQ.1 ) THEN +* +* One column case, (also recursion termination case), +* use unblocked code +* +* Transfer the sign +* + D( 1 ) = DCMPLX( -DSIGN( ONE, DBLE( A( 1, 1 ) ) ) ) +* +* Construct the row of U +* + A( 1, 1 ) = A( 1, 1 ) - D( 1 ) +* +* Scale the elements 2:M of the column +* +* Determine machine safe minimum +* + SFMIN = DLAMCH('S') +* +* Construct the subdiagonal elements of L +* + IF( CABS1( A( 1, 1 ) ) .GE. SFMIN ) THEN + CALL ZSCAL( M-1, CONE / A( 1, 1 ), A( 2, 1 ), 1 ) + ELSE + DO I = 2, M + A( I, 1 ) = A( I, 1 ) / A( 1, 1 ) + END DO + END IF +* + ELSE +* +* Divide the matrix B into four submatrices +* + N1 = MIN( M, N ) / 2 + N2 = N-N1 + +* +* Factor B11, recursive call +* + CALL ZLAUNHR_COL_GETRFNP2( N1, N1, A, LDA, D, IINFO ) +* +* Solve for B21 +* + CALL ZTRSM( 'R', 'U', 'N', 'N', M-N1, N1, CONE, A, LDA, + $ A( N1+1, 1 ), LDA ) +* +* Solve for B12 +* + CALL ZTRSM( 'L', 'L', 'N', 'U', N1, N2, CONE, A, LDA, + $ A( 1, N1+1 ), LDA ) +* +* Update B22, i.e. compute the Schur complement +* B22 := B22 - B21*B12 +* + CALL ZGEMM( 'N', 'N', M-N1, N2, N1, -CONE, A( N1+1, 1 ), LDA, + $ A( 1, N1+1 ), LDA, CONE, A( N1+1, N1+1 ), LDA ) +* +* Factor B22, recursive call +* + CALL ZLAUNHR_COL_GETRFNP2( M-N1, N2, A( N1+1, N1+1 ), LDA, + $ D( N1+1 ), IINFO ) +* + END IF + RETURN +* +* End of ZLAUNHR_COL_GETRFNP2 +* + END diff --git a/lapack-netlib/SRC/zporfsx.f b/lapack-netlib/SRC/zporfsx.f index ee8cfbc6a..bbff4331e 100644 --- a/lapack-netlib/SRC/zporfsx.f +++ b/lapack-netlib/SRC/zporfsx.f @@ -44,7 +44,7 @@ *> \verbatim *> *> ZPORFSX improves the computed solution to a system of linear -*> equations when the coefficient matrix is symmetric positive +*> equations when the coefficient matrix is Hermitian positive *> definite, and provides error bounds and backward error estimates *> for the solution. In addition to normwise error bound, the code *> provides maximum componentwise error bound if possible. See @@ -103,7 +103,7 @@ *> \param[in] A *> \verbatim *> A is COMPLEX*16 array, dimension (LDA,N) -*> The symmetric matrix A. If UPLO = 'U', the leading N-by-N +*> The Hermitian matrix A. If UPLO = 'U', the leading N-by-N *> upper triangular part of A contains the upper triangular part *> of the matrix A, and the strictly lower triangular part of A *> is not referenced. 
If UPLO = 'L', the leading N-by-N lower @@ -134,7 +134,7 @@ *> \param[in,out] S *> \verbatim *> S is DOUBLE PRECISION array, dimension (N) -*> The row scale factors for A. If EQUED = 'Y', A is multiplied on +*> The scale factors for A. If EQUED = 'Y', A is multiplied on *> the left and right by diag(S). S is an input argument if FACT = *> 'F'; otherwise, S is an output argument. If FACT = 'F' and EQUED *> = 'Y', each element of S must be positive. If S is output, each @@ -262,7 +262,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -298,14 +298,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -313,9 +313,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/zposvxx.f b/lapack-netlib/SRC/zposvxx.f index 8126f14be..913d16cb2 100644 --- a/lapack-netlib/SRC/zposvxx.f +++ b/lapack-netlib/SRC/zposvxx.f @@ -45,7 +45,7 @@ *> *> ZPOSVXX uses the Cholesky factorization A = U**T*U or A = L*L**T *> to compute the solution to a complex*16 system of linear equations -*> A * X = B, where A is an N-by-N symmetric positive definite matrix +*> A * X = B, where A is an N-by-N Hermitian positive definite matrix *> and X and B are N-by-NRHS matrices. *> *> If requested, both normwise and maximum componentwise error bounds @@ -157,7 +157,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX*16 array, dimension (LDA,N) -*> On entry, the symmetric matrix A, except if FACT = 'F' and EQUED = +*> On entry, the Hermitian matrix A, except if FACT = 'F' and EQUED = *> 'Y', then A must contain the equilibrated matrix *> diag(S)*A*diag(S). If UPLO = 'U', the leading N-by-N upper *> triangular part of A contains the upper triangular part of the @@ -365,7 +365,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. 
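The PARAMS(LA_LINRX_ITREF_I) values documented in these hunks only switch the refinement machinery on or off; the loop underneath is the classical one. A bare-bones sketch is given below, with the extra-precise residual, the doubled-single fallback, and the error-bound bookkeeping of the real routines all omitted (refine is my name for the sketch):

    import numpy as np

    def refine(A, b, x, steps=3):
        # Classical iterative refinement: correct x by solving for the
        # residual with the same matrix.  The LAPACK drivers reuse their
        # existing Cholesky/LDL**T factors here; np.linalg.solve refactors.
        for _ in range(steps):
            r = b - A @ x
            x = x + np.linalg.solve(A, r)
        return x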
*> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -401,14 +401,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -416,9 +416,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. +*> = 1.0: Use the extra-precise refinement algorithm. *> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/zpotrf2.f b/lapack-netlib/SRC/zpotrf2.f index e37c9f6d6..85c434d47 100644 --- a/lapack-netlib/SRC/zpotrf2.f +++ b/lapack-netlib/SRC/zpotrf2.f @@ -24,7 +24,7 @@ *> *> \verbatim *> -*> ZPOTRF2 computes the Cholesky factorization of a real symmetric +*> ZPOTRF2 computes the Cholesky factorization of a Hermitian *> positive definite matrix A using the recursive algorithm. *> *> The factorization has the form @@ -63,7 +63,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX*16 array, dimension (LDA,N) -*> On entry, the symmetric matrix A. If UPLO = 'U', the leading +*> On entry, the Hermitian matrix A. If UPLO = 'U', the leading *> N-by-N upper triangular part of A contains the upper *> triangular part of the matrix A, and the strictly lower *> triangular part of A is not referenced. If UPLO = 'L', the diff --git a/lapack-netlib/SRC/zstemr.f b/lapack-netlib/SRC/zstemr.f index ac7552a6a..8685542de 100644 --- a/lapack-netlib/SRC/zstemr.f +++ b/lapack-netlib/SRC/zstemr.f @@ -250,13 +250,13 @@ *> \param[in,out] TRYRAC *> \verbatim *> TRYRAC is LOGICAL -*> If TRYRAC.EQ..TRUE., indicates that the code should check whether +*> If TRYRAC = .TRUE., indicates that the code should check whether *> the tridiagonal matrix defines its eigenvalues to high relative *> accuracy. If so, the code uses relative-accuracy preserving *> algorithms that might be (a bit) slower depending on the matrix. *> If the matrix does not define its eigenvalues to high relative *> accuracy, the code can uses possibly faster algorithms. -*> If TRYRAC.EQ..FALSE., the code is not required to guarantee +*> If TRYRAC = .FALSE., the code is not required to guarantee *> relatively accurate eigenvalues and can use the fastest possible *> techniques. *> On exit, a .TRUE. TRYRAC will be set to .FALSE. if the matrix diff --git a/lapack-netlib/SRC/zsycon_3.f b/lapack-netlib/SRC/zsycon_3.f index 856845960..33bd23849 100644 --- a/lapack-netlib/SRC/zsycon_3.f +++ b/lapack-netlib/SRC/zsycon_3.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE ZSYCON_3( UPLO, N, A, LDA, E, IPIV, ANORM, RCOND, -* WORK, IWORK, INFO ) +* WORK, INFO ) * * .. Scalar Arguments .. * CHARACTER UPLO @@ -27,7 +27,7 @@ * DOUBLE PRECISION ANORM, RCOND * .. * .. Array Arguments .. 
-* INTEGER IPIV( * ), IWORK( * ) +* INTEGER IPIV( * ) * COMPLEX*16 A( LDA, * ), E ( * ), WORK( * ) * .. * @@ -129,11 +129,6 @@ *> WORK is COMPLEX*16 array, dimension (2*N) *> \endverbatim *> -*> \param[out] IWORK -*> \verbatim -*> IWORK is INTEGER array, dimension (N) -*> \endverbatim -*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER diff --git a/lapack-netlib/SRC/zsyconvf.f b/lapack-netlib/SRC/zsyconvf.f index b26bfd63b..2d5ce882e 100644 --- a/lapack-netlib/SRC/zsyconvf.f +++ b/lapack-netlib/SRC/zsyconvf.f @@ -294,7 +294,7 @@ * * Convert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in factorization order where i decreases from N to 1 * I = N @@ -347,7 +347,7 @@ * * Revert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in reverse factorization order where i increases from 1 to N * I = 1 @@ -438,7 +438,7 @@ * * Convert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in factorization order where k increases from 1 to N * I = 1 @@ -491,7 +491,7 @@ * * Revert PERMUTATIONS and IPIV * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in reverse factorization order where i decreases from N to 1 * I = N diff --git a/lapack-netlib/SRC/zsyconvf_rook.f b/lapack-netlib/SRC/zsyconvf_rook.f index 5c36f4bcd..410d2eb34 100644 --- a/lapack-netlib/SRC/zsyconvf_rook.f +++ b/lapack-netlib/SRC/zsyconvf_rook.f @@ -285,7 +285,7 @@ * * Convert PERMUTATIONS * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in factorization order where i decreases from N to 1 * I = N @@ -336,7 +336,7 @@ * * Revert PERMUTATIONS * -* Apply permutaions to submatrices of upper part of A +* Apply permutations to submatrices of upper part of A * in reverse factorization order where i increases from 1 to N * I = 1 @@ -426,7 +426,7 @@ * * Convert PERMUTATIONS * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in factorization order where i increases from 1 to N * I = 1 @@ -477,7 +477,7 @@ * * Revert PERMUTATIONS * -* Apply permutaions to submatrices of lower part of A +* Apply permutations to submatrices of lower part of A * in reverse factorization order where i decreases from N to 1 * I = N diff --git a/lapack-netlib/SRC/zsyrfsx.f b/lapack-netlib/SRC/zsyrfsx.f index 3420d70cd..d086510d8 100644 --- a/lapack-netlib/SRC/zsyrfsx.f +++ b/lapack-netlib/SRC/zsyrfsx.f @@ -271,7 +271,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. *> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -307,14 +307,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. 
*> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -322,9 +322,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the double-precision refinement algorithm, +*> = 1.0: Use the double-precision refinement algorithm, *> possibly with doubled-single computations if the *> compilation environment does not support DOUBLE *> PRECISION. diff --git a/lapack-netlib/SRC/zsysv_aa.f b/lapack-netlib/SRC/zsysv_aa.f index 325d07c54..4e87bd105 100644 --- a/lapack-netlib/SRC/zsysv_aa.f +++ b/lapack-netlib/SRC/zsysv_aa.f @@ -42,7 +42,7 @@ *> matrices. *> *> Aasen's algorithm is used to factor A as -*> A = U * T * U**T, if UPLO = 'U', or +*> A = U**T * T * U, if UPLO = 'U', or *> A = L * T * L**T, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is symmetric tridiagonal. The factored @@ -86,7 +86,7 @@ *> *> On exit, if INFO = 0, the tridiagonal matrix T and the *> multipliers used to obtain the factor U or L from the -*> factorization A = U*T*U**T or A = L*T*L**T as computed by +*> factorization A = U**T*T*U or A = L*T*L**T as computed by *> ZSYTRF. *> \endverbatim *> @@ -230,7 +230,7 @@ RETURN END IF * -* Compute the factorization A = U*T*U**T or A = L*T*L**T. +* Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL ZSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) IF( INFO.EQ.0 ) THEN diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f index 029ed587d..923eaaec0 100644 --- a/lapack-netlib/SRC/zsysv_aa_2stage.f +++ b/lapack-netlib/SRC/zsysv_aa_2stage.f @@ -43,8 +43,8 @@ *> matrices. *> *> Aasen's 2-stage algorithm is used to factor A as -*> A = U * T * U**H, if UPLO = 'U', or -*> A = L * T * L**H, if UPLO = 'L', +*> A = U**T * T * U, if UPLO = 'U', or +*> A = L * T * L**T, if UPLO = 'L', *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is symmetric and band. The matrix T is *> then LU-factored with partial pivoting. The factored form of A @@ -257,7 +257,7 @@ END IF * * -* Compute the factorization A = U*T*U**H or A = L*T*L**H. +* Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL ZSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, $ WORK, LWORK, INFO ) diff --git a/lapack-netlib/SRC/zsysvxx.f b/lapack-netlib/SRC/zsysvxx.f index ef44d09d3..e29439385 100644 --- a/lapack-netlib/SRC/zsysvxx.f +++ b/lapack-netlib/SRC/zsysvxx.f @@ -378,7 +378,7 @@ *> information as described below. There currently are up to three *> pieces of information returned for each right-hand side. If *> componentwise accuracy is not requested (PARAMS(3) = 0.0), then -*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS .LT. 3, then at most +*> ERR_BNDS_COMP is not accessed. If N_ERR_BNDS < 3, then at most *> the first (:,N_ERR_BNDS) entries are returned. 
*> *> The first index in ERR_BNDS_COMP(i,:) corresponds to the ith @@ -414,14 +414,14 @@ *> \param[in] NPARAMS *> \verbatim *> NPARAMS is INTEGER -*> Specifies the number of parameters set in PARAMS. If .LE. 0, the +*> Specifies the number of parameters set in PARAMS. If <= 0, the *> PARAMS array is never referenced and default values are used. *> \endverbatim *> *> \param[in,out] PARAMS *> \verbatim *> PARAMS is DOUBLE PRECISION array, dimension NPARAMS -*> Specifies algorithm parameters. If an entry is .LT. 0.0, then +*> Specifies algorithm parameters. If an entry is < 0.0, then *> that entry will be filled with default value used for that *> parameter. Only positions up to NPARAMS are accessed; defaults *> are used for higher-numbered parameters. @@ -429,9 +429,9 @@ *> PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative *> refinement or not. *> Default: 1.0D+0 -*> = 0.0 : No refinement is performed, and no error bounds are +*> = 0.0: No refinement is performed, and no error bounds are *> computed. -*> = 1.0 : Use the extra-precise refinement algorithm. +*> = 1.0: Use the extra-precise refinement algorithm. *> (other values are reserved for future use) *> *> PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual diff --git a/lapack-netlib/SRC/zsytf2_rk.f b/lapack-netlib/SRC/zsytf2_rk.f index b1a02f4a5..4ae1a4a22 100644 --- a/lapack-netlib/SRC/zsytf2_rk.f +++ b/lapack-netlib/SRC/zsytf2_rk.f @@ -321,7 +321,7 @@ * * Factorize A as U*D*U**T using the upper triangle of A * -* Initilize the first entry of array E, where superdiagonal +* Initialize the first entry of array E, where superdiagonal * elements of D are stored * E( 1 ) = CZERO @@ -632,7 +632,7 @@ * * Factorize A as L*D*L**T using the lower triangle of A * -* Initilize the unused last entry of the subdiagonal array E. +* Initialize the unused last entry of the subdiagonal array E. * E( N ) = CZERO * diff --git a/lapack-netlib/SRC/zsytrf.f b/lapack-netlib/SRC/zsytrf.f index 663199c8a..54e22cca1 100644 --- a/lapack-netlib/SRC/zsytrf.f +++ b/lapack-netlib/SRC/zsytrf.f @@ -43,7 +43,7 @@ *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and D is symmetric and block diagonal with -*> with 1-by-1 and 2-by-2 diagonal blocks. +*> 1-by-1 and 2-by-2 diagonal blocks. *> *> This is the blocked version of the algorithm, calling Level 3 BLAS. *> \endverbatim diff --git a/lapack-netlib/SRC/zsytrf_aa.f b/lapack-netlib/SRC/zsytrf_aa.f index b25b1fbce..e547c6a60 100644 --- a/lapack-netlib/SRC/zsytrf_aa.f +++ b/lapack-netlib/SRC/zsytrf_aa.f @@ -37,7 +37,7 @@ *> ZSYTRF_AA computes the factorization of a complex symmetric matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a complex symmetric tridiagonal matrix. @@ -223,7 +223,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... 
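
The Aasen form documented above (A = U**T*T*U when UPLO = 'U') is produced by ZSYTRF_AA and consumed by ZSYTRS_AA. A minimal, self-contained sketch of that factor/solve sequence, not part of the patch: the calling sequences follow the routine headers in this series, while the toy matrix and the generous fixed LWMAX are illustrative assumptions (a production caller would size WORK via an LWORK = -1 query).

      PROGRAM AASEN
      IMPLICIT NONE
      INTEGER            NMAX, LWMAX
      PARAMETER          ( NMAX = 3, LWMAX = 64*NMAX )
      COMPLEX*16         A( NMAX, NMAX ), B( NMAX, 1 ), WORK( LWMAX )
      INTEGER            IPIV( NMAX ), INFO
*     Complex symmetric (not Hermitian) data; with UPLO = 'U' only the
*     upper triangle of A is referenced.
      A( 1, 1 ) = ( 4.0D0, 0.0D0 )
      A( 1, 2 ) = ( 1.0D0, 2.0D0 )
      A( 1, 3 ) = ( 0.0D0, 1.0D0 )
      A( 2, 2 ) = ( 3.0D0, 0.0D0 )
      A( 2, 3 ) = ( 1.0D0, 1.0D0 )
      A( 3, 3 ) = ( 2.0D0, 0.0D0 )
      B( 1, 1 ) = ( 1.0D0, 0.0D0 )
      B( 2, 1 ) = ( 0.0D0, 0.0D0 )
      B( 3, 1 ) = ( 1.0D0, 0.0D0 )
*     Factor A = U**T*T*U (Aasen), then solve A*X = B with the factors.
*     ZSYTRS_AA needs LWORK >= MAX(1,3*N-2); LWMAX is ample for both.
      CALL ZSYTRF_AA( 'U', NMAX, A, NMAX, IPIV, WORK, LWMAX, INFO )
      IF( INFO.EQ.0 )
     $   CALL ZSYTRS_AA( 'U', NMAX, 1, A, NMAX, IPIV, B, NMAX,
     $                   WORK, LWMAX, INFO )
      WRITE( *, * ) 'INFO =', INFO
      WRITE( *, * ) 'X =', B( 1, 1 ), B( 2, 1 ), B( 3, 1 )
      END
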
* * Copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N)) @@ -256,7 +256,7 @@ $ A( MAX(1, J), J+1 ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J @@ -375,7 +375,7 @@ $ A( J+1, MAX(1, J) ), LDA, $ IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) ) * -* Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot) +* Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot) * DO J2 = J+2, MIN(N, J+JB+1) IPIV( J2 ) = IPIV( J2 ) + J diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f index d3486c1a7..67a1c1f6f 100644 --- a/lapack-netlib/SRC/zsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f @@ -38,7 +38,7 @@ *> ZSYTRF_AA_2STAGE computes the factorization of a complex symmetric matrix A *> using the Aasen's algorithm. The form of the factorization is *> -*> A = U*T*U**T or A = L*T*L**T +*> A = U**T*T*U or A = L*T*L**T *> *> where U (or L) is a product of permutation and unit upper (lower) *> triangular matrices, and T is a complex symmetric band matrix with the @@ -275,7 +275,7 @@ IF( UPPER ) THEN * * ..................................................... -* Factorize A as L*D*L**T using the upper triangle of A +* Factorize A as U**T*D*U using the upper triangle of A * ..................................................... * DO J = 0, NT-1 @@ -448,12 +448,14 @@ c END IF * > Apply pivots to previous columns of L CALL ZSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, $ A( (J+1)*NB+1, I2 ), 1 ) -* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL ZSWAP( I2-I1-1, A( I1, I1+1 ), LDA, - $ A( I1+1, I2 ), 1 ) +* > Swap A(I1+1:M, I1) with A(I2, I1+1:M) + IF( I2.GT.(I1+1) ) + $ CALL ZSWAP( I2-I1-1, A( I1, I1+1 ), LDA, + $ A( I1+1, I2 ), 1 ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL ZSWAP( N-I2, A( I1, I2+1 ), LDA, - $ A( I2, I2+1 ), LDA ) + IF( I2.LT.N ) + $ CALL ZSWAP( N-I2, A( I1, I2+1 ), LDA, + $ A( I2, I2+1 ), LDA ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) @@ -637,11 +639,13 @@ c END IF CALL ZSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, $ A( I2, (J+1)*NB+1 ), LDA ) * > Swap A(I1+1:M, I1) with A(I2, I1+1:M) - CALL ZSWAP( I2-I1-1, A( I1+1, I1 ), 1, - $ A( I2, I1+1 ), LDA ) + IF( I2.GT.(I1+1) ) + $ CALL ZSWAP( I2-I1-1, A( I1+1, I1 ), 1, + $ A( I2, I1+1 ), LDA ) * > Swap A(I2+1:M, I1) with A(I2+1:M, I2) - CALL ZSWAP( N-I2, A( I2+1, I1 ), 1, - $ A( I2+1, I2 ), 1 ) + IF( I2.LT.N ) + $ CALL ZSWAP( N-I2, A( I2+1, I1 ), 1, + $ A( I2+1, I2 ), 1 ) * > Swap A(I1, I1) with A(I2, I2) PIV = A( I1, I1 ) A( I1, I1 ) = A( I2, I2 ) diff --git a/lapack-netlib/SRC/zsytri2.f b/lapack-netlib/SRC/zsytri2.f index e7303c90b..9929eb2c6 100644 --- a/lapack-netlib/SRC/zsytri2.f +++ b/lapack-netlib/SRC/zsytri2.f @@ -62,7 +62,7 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX*16 array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers +*> On entry, the block diagonal matrix D and the multipliers *> used to obtain the factor U or L as computed by ZSYTRF. *> *> On exit, if INFO = 0, the (symmetric) inverse of the original @@ -82,7 +82,7 @@ *> \param[in] IPIV *> \verbatim *> IPIV is INTEGER array, dimension (N) -*> Details of the interchanges and the NB structure of D +*> Details of the interchanges and the block structure of D *> as determined by ZSYTRF. 
*> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrs2.f b/lapack-netlib/SRC/zsytrs2.f index c0ee206a5..6e9cca425 100644 --- a/lapack-netlib/SRC/zsytrs2.f +++ b/lapack-netlib/SRC/zsytrs2.f @@ -36,7 +36,7 @@ *> *> \verbatim *> -*> ZSYTRS2 solves a system of linear equations A*X = B with a real +*> ZSYTRS2 solves a system of linear equations A*X = B with a complex *> symmetric matrix A using the factorization A = U*D*U**T or *> A = L*D*L**T computed by ZSYTRF and converted by ZSYCONV. *> \endverbatim diff --git a/lapack-netlib/SRC/zsytrs_aa.f b/lapack-netlib/SRC/zsytrs_aa.f index e62e9e486..0f0664009 100644 --- a/lapack-netlib/SRC/zsytrs_aa.f +++ b/lapack-netlib/SRC/zsytrs_aa.f @@ -37,7 +37,7 @@ *> \verbatim *> *> ZSYTRS_AA solves a system of linear equations A*X = B with a complex -*> symmetric matrix A using the factorization A = U*T*U**T or +*> symmetric matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by ZSYTRF_AA. *> \endverbatim * @@ -49,7 +49,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -97,14 +97,16 @@ *> The leading dimension of the array B. LDB >= max(1,N). *> \endverbatim *> -*> \param[in] WORK +*> \param[out] WORK *> \verbatim -*> WORK is DOUBLE array, dimension (MAX(1,LWORK)) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER, LWORK >= MAX(1,3*N-2). +*> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -198,22 +200,29 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. 
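
The rewritten solve below proceeds in three stages: forward substitution with U**T, a solve with the tridiagonal matrix T, and backward substitution with U. The middle stage is delegated to ZGTSV, with the sub-, main and super-diagonals of T packed at WORK(1), WORK(N) and WORK(2*N). A standalone sketch of that kernel on a toy system, not part of the patch; note that ZGTSV applies partial pivoting and overwrites both the diagonals and the right-hand side.

      PROGRAM GTSV
      IMPLICIT NONE
      INTEGER            N, INFO
      PARAMETER          ( N = 3 )
      COMPLEX*16         DL( N-1 ), D( N ), DU( N-1 ), B( N, 1 )
*     Tridiagonal T: 2 on the diagonal, 1 on the off-diagonals.
      D( 1 )  = ( 2.0D0, 0.0D0 )
      D( 2 )  = ( 2.0D0, 0.0D0 )
      D( 3 )  = ( 2.0D0, 0.0D0 )
      DL( 1 ) = ( 1.0D0, 0.0D0 )
      DL( 2 ) = ( 1.0D0, 0.0D0 )
      DU( 1 ) = ( 1.0D0, 0.0D0 )
      DU( 2 ) = ( 1.0D0, 0.0D0 )
      B( 1, 1 ) = ( 1.0D0, 0.0D0 )
      B( 2, 1 ) = ( 0.0D0, 0.0D0 )
      B( 3, 1 ) = ( 1.0D0, 0.0D0 )
*     Solve T*X = B; on exit B is overwritten by the solution X.
      CALL ZGTSV( N, 1, DL, D, DU, B, N, INFO )
      WRITE( *, * ) 'INFO =', INFO
      WRITE( *, * ) 'X =', B( 1, 1 ), B( 2, 1 ), B( 3, 1 )
      END
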
+* +* 1) Forward substitution with U**T +* + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B * -* Pivot, P**T * B + DO K = 1, N + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO * - DO K = 1, N - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO +* Compute U**T \ B -> B [ (U**T \P**T * B) ] * -* Compute (U \P**T * B) -> B [ (U \P**T * B) ] + CALL ZTRSM( 'L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB) + END IF * - CALL ZTRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) +* 2) Solve with triangular matrix T * -* Compute T \ B -> B [ T \ (U \P**T * B) ] +* Compute T \ B -> B [ T \ (U**T \P**T * B) ] * CALL ZLACPY( 'F', 1, N, A( 1, 1 ), LDA+1, WORK( N ), 1) IF( N.GT.1 ) THEN @@ -223,35 +232,47 @@ CALL ZGTSV( N, NRHS, WORK( 1 ), WORK( N ), WORK( 2*N ), B, LDB, $ INFO ) * -* Compute (U**T \ B) -> B [ U**T \ (T \ (U \P**T * B) ) ] +* 3) Backward substitution with U * - CALL ZTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA, - $ B( 2, 1 ), LDB) + IF( N.GT.1 ) THEN * -* Pivot, P * B [ P * (U**T \ (T \ (U \P**T * B) )) ] +* Compute U \ B -> B [ U \ (T \ (U**T \P**T * B) ) ] * - DO K = N, 1, -1 - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO + CALL ZTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), + $ LDA, B( 2, 1 ), LDB) +* +* Pivot, P * B -> B [ P * (U \ (T \ (U**T \P**T * B) )) ] +* + DO K = N, 1, -1 + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO + END IF * ELSE * * Solve A*X = B, where A = L*T*L**T. * -* Pivot, P**T * B +* 1) Forward substitution with L * - DO K = 1, N - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO + IF( N.GT.1 ) THEN +* +* Pivot, P**T * B -> B * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] + DO K = 1, N + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO +* +* Compute L \ B -> B [ (L \P**T * B) ] +* + CALL ZTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB) + END IF * - CALL ZTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) +* 2) Solve with triangular matrix T * * Compute T \ B -> B [ T \ (L \P**T * B) ] * @@ -263,18 +284,23 @@ CALL ZGTSV( N, NRHS, WORK( 1 ), WORK(N), WORK( 2*N ), B, LDB, $ INFO) * -* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] +* 3) Backward substitution with L**T * - CALL ZTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA, - $ B( 2, 1 ), LDB) + IF( N.GT.1 ) THEN * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* Compute (L**T \ B) -> B [ L**T \ (T \ (L \P**T * B) ) ] * - DO K = N, 1, -1 - KP = IPIV( K ) - IF( KP.NE.K ) - $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) - END DO + CALL ZTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), + $ LDA, B( 2, 1 ), LDB) +* +* Pivot, P * B -> B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* + DO K = N, 1, -1 + KP = IPIV( K ) + IF( KP.NE.K ) + $ CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB ) + END DO + END IF * END IF * diff --git a/lapack-netlib/SRC/zsytrs_aa_2stage.f b/lapack-netlib/SRC/zsytrs_aa_2stage.f index fa15eee90..bf060b2d3 100644 --- a/lapack-netlib/SRC/zsytrs_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrs_aa_2stage.f @@ -36,7 +36,7 @@ *> \verbatim *> *> ZSYTRS_AA_2STAGE solves a system of linear equations A*X = B with a complex -*> symmetric matrix A using the factorization A = 
U*T*U**T or +*> symmetric matrix A using the factorization A = U**T*T*U or *> A = L*T*L**T computed by ZSYTRF_AA_2STAGE. *> \endverbatim * @@ -48,7 +48,7 @@ *> UPLO is CHARACTER*1 *> Specifies whether the details of the factorization are stored *> as an upper or lower triangular matrix. -*> = 'U': Upper triangular, form is A = U*T*U**T; +*> = 'U': Upper triangular, form is A = U**T*T*U; *> = 'L': Lower triangular, form is A = L*T*L**T. *> \endverbatim *> @@ -208,15 +208,15 @@ * IF( UPPER ) THEN * -* Solve A*X = B, where A = U*T*U**T. +* Solve A*X = B, where A = U**T*T*U. * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (U**T \P**T * B) -> B [ (U**T \P**T * B) ] +* Compute (U**T \ B) -> B [ (U**T \P**T * B) ] * CALL ZTRSM( 'L', 'U', 'T', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) @@ -234,7 +234,7 @@ CALL ZTRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (U \ (T \ (U**T \P**T * B) )) ] +* Pivot, P * B -> B [ P * (U \ (T \ (U**T \P**T * B) )) ] * CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * @@ -246,11 +246,11 @@ * IF( N.GT.NB ) THEN * -* Pivot, P**T * B +* Pivot, P**T * B -> B * CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 ) * -* Compute (L \P**T * B) -> B [ (L \P**T * B) ] +* Compute (L \ B) -> B [ (L \P**T * B) ] * CALL ZTRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) @@ -268,7 +268,7 @@ CALL ZTRSM( 'L', 'L', 'T', 'U', N-NB, NRHS, ONE, A(NB+1, 1), $ LDA, B(NB+1, 1), LDB) * -* Pivot, P * B [ P * (L**T \ (T \ (L \P**T * B) )) ] +* Pivot, P * B -> B [ P * (L**T \ (T \ (L \P**T * B) )) ] * CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 ) * diff --git a/lapack-netlib/SRC/ztgsy2.f b/lapack-netlib/SRC/ztgsy2.f index f89effd6c..028ddfd3d 100644 --- a/lapack-netlib/SRC/ztgsy2.f +++ b/lapack-netlib/SRC/ztgsy2.f @@ -67,7 +67,7 @@ *> R * B**H + L * E**H = scale * -F *> *> This case is used to compute an estimate of Dif[(A, D), (B, E)] = -*> = sigma_min(Z) using reverse communicaton with ZLACON. +*> = sigma_min(Z) using reverse communication with ZLACON. *> *> ZTGSY2 also (IJOB >= 1) contributes to the computation in ZTGSYL *> of an upper bound on the separation between to matrix pairs. Then @@ -81,7 +81,7 @@ *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 -*> = 'N', solve the generalized Sylvester equation (1). +*> = 'N': solve the generalized Sylvester equation (1). *> = 'T': solve the 'transposed' system (3). *> \endverbatim *> diff --git a/lapack-netlib/SRC/ztpmlqt.f b/lapack-netlib/SRC/ztpmlqt.f index 6a67e4443..cc333f5a2 100644 --- a/lapack-netlib/SRC/ztpmlqt.f +++ b/lapack-netlib/SRC/ztpmlqt.f @@ -94,7 +94,7 @@ *> *> \param[in] V *> \verbatim -*> V is COMPLEX*16 array, dimension (LDA,K) +*> V is COMPLEX*16 array, dimension (LDV,K) *> The i-th row must contain the vector which defines the *> elementary reflector H(i), for i = 1,2,...,k, as returned by *> DTPLQT in B. See Further Details. diff --git a/lapack-netlib/SRC/ztpmqrt.f b/lapack-netlib/SRC/ztpmqrt.f index aca7ff00f..530dca458 100644 --- a/lapack-netlib/SRC/ztpmqrt.f +++ b/lapack-netlib/SRC/ztpmqrt.f @@ -94,7 +94,7 @@ *> *> \param[in] V *> \verbatim -*> V is COMPLEX*16 array, dimension (LDA,K) +*> V is COMPLEX*16 array, dimension (LDV,K) *> The i-th column must contain the vector which defines the *> elementary reflector H(i), for i = 1,2,...,k, as returned by *> CTPQRT in B. See Further Details. 
diff --git a/lapack-netlib/SRC/ztprfb.f b/lapack-netlib/SRC/ztprfb.f index 1a62829d5..f96c237ee 100644 --- a/lapack-netlib/SRC/ztprfb.f +++ b/lapack-netlib/SRC/ztprfb.f @@ -152,8 +152,8 @@ *> \verbatim *> LDA is INTEGER *> The leading dimension of the array A. -*> If SIDE = 'L', LDC >= max(1,K); -*> If SIDE = 'R', LDC >= max(1,M). +*> If SIDE = 'L', LDA >= max(1,K); +*> If SIDE = 'R', LDA >= max(1,M). *> \endverbatim *> *> \param[in,out] B diff --git a/lapack-netlib/SRC/zungtsqr.f b/lapack-netlib/SRC/zungtsqr.f new file mode 100644 index 000000000..7b04e9a29 --- /dev/null +++ b/lapack-netlib/SRC/zungtsqr.f @@ -0,0 +1,307 @@ +*> \brief \b ZUNGTSQR +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZUNGTSQR + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> +* Definition: +* =========== +* +* SUBROUTINE ZUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, +* $ INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGTSQR generates an M-by-N complex matrix Q_out with orthonormal +*> columns, which are the first N columns of a product of comlpex unitary +*> matrices of order M which are returned by ZLATSQR +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> See the documentation for ZLATSQR. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by DLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by ZLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not accessed. +*> The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by ZLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored) (same format as the output A +*> below the diagonal in ZLATSQR). +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. 
Each of NIRB block reflector sequences +*> is stored in a larger NB-by-N column block of T and consists +*> of NICB smaller NB-by-NB upper-triangular column blocks. +*> (same format as the output T in ZLATSQR). +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB1,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(2,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. LWORK >= (M+NB)*N. +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup comlex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZLAMTSQR, ZLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + LQUERY = LWORK.EQ.-1 + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array C(LDC, N) and WORK(LWORK) +* in the call to ZLAMTSQR. See the documentation for ZLAMTSQR. +* + IF( LWORK.LT.2 .AND. (.NOT.LQUERY) ) THEN + INFO = -10 + ELSE +* +* Set block size for column blocks +* + NBLOCAL = MIN( NB, N ) +* +* LWORK = -1, then set the size for the array C(LDC,N) +* in ZLAMTSQR call and set the optimal size of the work array +* WORK(LWORK) in ZLAMTSQR call. 
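
The LWORK = -1 path mentioned here is the standard LAPACK workspace query: the routine performs no computation and returns the optimal workspace size in WORK(1). A minimal sketch of the two-call idiom against the ZUNGTSQR interface defined above, not part of the patch; the QUERY array of length 2 relies on the documented minimum WORK dimension of MAX(2,LWORK).

      SUBROUTINE TSQR_Q( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
     $                   INFO )
      IMPLICIT NONE
      INTEGER            M, N, MB, NB, LDA, LDT, LWORK, INFO, LWOPT
      COMPLEX*16         A( LDA, * ), T( LDT, * ), WORK( * )
      COMPLEX*16         QUERY( 2 )
*     Workspace query: with LWORK = -1 only the optimal size is
*     computed and returned in QUERY( 1 ).
      CALL ZUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, QUERY, -1, INFO )
      LWOPT = INT( DBLE( QUERY( 1 ) ) )
*     The caller-supplied WORK must satisfy LWORK >= LWOPT, i.e.
*     LWORK >= (M+NB)*N per the header above.
      IF( INFO.EQ.0 .AND. LWORK.GE.LWOPT ) THEN
         CALL ZUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
     $                  INFO )
      END IF
      END
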
+* + LDC = M + LC = LDC*N + LW = N * NBLOCAL +* + LWORKOPT = LC+LW +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -10 + END IF + END IF +* + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZUNGTSQR', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in +* of M-by-M orthogonal matrix Q_in, which is implicitly stored in +* the subdiagonal part of input array A and in the input array T. +* Perform by the following operation using the routine ZLAMTSQR. +* +* Q1_in = Q_in * ( I ), where I is a N-by-N identity matrix, +* ( 0 ) 0 is a (M-N)-by-N zero matrix. +* +* (1a) Form M-by-N matrix in the array WORK(1:LDC*N) with ones +* on the diagonal and zeros elsewhere. +* + CALL ZLASET( 'F', M, N, CZERO, CONE, WORK, LDC ) +* +* (1b) On input, WORK(1:LDC*N) stores ( I ); +* ( 0 ) +* +* On output, WORK(1:LDC*N) stores Q1_in. +* + CALL ZLAMTSQR( 'L', 'N', M, N, N, MB, NBLOCAL, A, LDA, T, LDT, + $ WORK, LDC, WORK( LC+1 ), LW, IINFO ) +* +* (2) Copy the result from the part of the work array (1:M,1:N) +* with the leading dimension LDC that starts at WORK(1) into +* the output array A(1:M,1:N) column-by-column. +* + DO J = 1, N + CALL ZCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 ) + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZUNGTSQR +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/zunhr_col.f b/lapack-netlib/SRC/zunhr_col.f new file mode 100644 index 000000000..71039fddb --- /dev/null +++ b/lapack-netlib/SRC/zunhr_col.f @@ -0,0 +1,441 @@ +*> \brief \b ZUNHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZUNHR_COL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> +* Definition: +* =========== +* +* SUBROUTINE ZUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO ) +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, M, N, NB +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), D( * ), T( LDT, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNHR_COL takes an M-by-N complex matrix Q_in with orthonormal columns +*> as input, stored in A, and performs Householder Reconstruction (HR), +*> i.e. reconstructs Householder vectors V(i) implicitly representing +*> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S, +*> where S is an N-by-N diagonal matrix with diagonal entries +*> equal to +1 or -1. The Householder vectors (columns V(i) of V) are +*> stored in A on output, and the diagonal entries of S are stored in D. +*> Block reflectors are also returned in T +*> (same output format as ZGEQRT). +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size to be used in the reconstruction +*> of Householder column vector blocks in the array A and +*> corresponding block reflectors in the array T. NB >= 1. 
+*> (Note that if NB > N, then N is used instead of NB +*> as the column block size.) +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> +*> The array A contains an M-by-N orthonormal matrix Q_in, +*> i.e the columns of A are orthogonal unit vectors. +*> +*> On exit: +*> +*> The elements below the diagonal of A represent the unit +*> lower-trapezoidal matrix V of Householder column vectors +*> V(i). The unit diagonal entries of V are not stored +*> (same format as the output below the diagonal in A from +*> ZGEQRT). The matrix T and the matrix V stored on output +*> in A implicitly define Q_out. +*> +*> The elements above the diagonal contain the factor U +*> of the "modified" LU-decomposition: +*> Q_in - ( S ) = V * U +*> ( 0 ) +*> where 0 is a (M-N)-by-(M-N) zero matrix. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX*16 array, +*> dimension (LDT, N) +*> +*> Let NOCB = Number_of_output_col_blocks +*> = CEIL(N/NB) +*> +*> On exit, T(1:NB, 1:N) contains NOCB upper-triangular +*> block reflectors used to define Q_out stored in compact +*> form as a sequence of upper-triangular NB-by-NB column +*> blocks (same format as the output T in ZGEQRT). +*> The matrix T and the matrix V stored on output in A +*> implicitly define Q_out. NOTE: The lower triangles +*> below the upper-triangular blcoks will be filled with +*> zeros. See Further Details. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] D +*> \verbatim +*> D is COMPLEX*16 array, dimension min(M,N). +*> The elements can be only plus or minus one. +*> +*> D(i) is constructed as D(i) = -SIGN(Q_in_i(i,i)), where +*> 1 <= i <= min(M,N), and Q_in_i is Q_in after performing +*> i-1 steps of “modified” Gaussian elimination. +*> See Further Details. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The computed M-by-M unitary factor Q_out is defined implicitly as +*> a product of unitary matrices Q_out(i). Each Q_out(i) is stored in +*> the compact WY-representation format in the corresponding blocks of +*> matrices V (stored in A) and T. +*> +*> The M-by-N unit lower-trapezoidal matrix V stored in the M-by-N +*> matrix A contains the column vectors V(i) in NB-size column +*> blocks VB(j). For example, VB(1) contains the columns +*> V(1), V(2), ... V(NB). NOTE: The unit entries on +*> the diagonal of Y are not stored in A. +*> +*> The number of column blocks is +*> +*> NOCB = Number_of_output_col_blocks = CEIL(N/NB) +*> +*> where each block is of order NB except for the last block, which +*> is of order LAST_NB = N - (NOCB-1)*NB. +*> +*> For example, if M=6, N=5 and NB=2, the matrix V is +*> +*> +*> V = ( VB(1), VB(2), VB(3) ) = +*> +*> = ( 1 ) +*> ( v21 1 ) +*> ( v31 v32 1 ) +*> ( v41 v42 v43 1 ) +*> ( v51 v52 v53 v54 1 ) +*> ( v61 v62 v63 v54 v65 ) +*> +*> +*> For each of the column blocks VB(i), an upper-triangular block +*> reflector TB(i) is computed. These blocks are stored as +*> a sequence of upper-triangular column blocks in the NB-by-N +*> matrix T. 
The size of each TB(i) block is NB-by-NB, except +*> for the last block, whose size is LAST_NB-by-LAST_NB. +*> +*> For example, if M=6, N=5 and NB=2, the matrix T is +*> +*> T = ( TB(1), TB(2), TB(3) ) = +*> +*> = ( t11 t12 t13 t14 t15 ) +*> ( t22 t24 ) +*> +*> +*> The M-by-M factor Q_out is given as a product of NOCB +*> unitary M-by-M matrices Q_out(i). +*> +*> Q_out = Q_out(1) * Q_out(2) * ... * Q_out(NOCB), +*> +*> where each matrix Q_out(i) is given by the WY-representation +*> using corresponding blocks from the matrices V and T: +*> +*> Q_out(i) = I - VB(i) * TB(i) * (VB(i))**T, +*> +*> where I is the identity matrix. Here is the formula with matrix +*> dimensions: +*> +*> Q(i){M-by-M} = I{M-by-M} - +*> VB(i){M-by-INB} * TB(i){INB-by-INB} * (VB(i))**T {INB-by-M}, +*> +*> where INB = NB, except for the last block NOCB +*> for which INB=LAST_NB. +*> +*> ===== +*> NOTE: +*> ===== +*> +*> If Q_in is the result of doing a QR factorization +*> B = Q_in * R_in, then: +*> +*> B = (Q_out*S) * R_in = Q_out * (S * R_in) = O_out * R_out. +*> +*> So if one wants to interpret Q_out as the result +*> of the QR factorization of B, then corresponding R_out +*> should be obtained by R_out = S * R_in, i.e. some rows of R_in +*> should be multiplied by -1. +*> +*> For the details of the algorithm, see [1]. +*> +*> [1] "Reconstructing Householder vectors from tall-skinny QR", +*> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen, +*> E. Solomonik, J. Parallel Distrib. Comput., +*> vol. 85, pp. 3-31, 2015. +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2019, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, M, N, NB +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), D( * ), T( LDT, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, JBTEMP1, JBTEMP2, JNB, + $ NPLUSONE +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZLAUNHR_COL_GETRFNP, ZSCAL, ZTRSM, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. N.GT.M ) THEN + INFO = -2 + ELSE IF( NB.LT.1 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -5 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -7 + END IF +* +* Handle error in the input parameters. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZUNHR_COL', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + RETURN + END IF +* +* On input, the M-by-N matrix A contains the unitary +* M-by-N matrix Q_in. 
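
A minimal sketch of how this routine is typically driven, not part of the patch: A(1:M,1:N) holds the explicit orthonormal Q_in (for instance formed by ZUNGTSQR), and the optional fix-up of an accompanying upper factor follows the R_out = S * R_in note in the header above. The wrapper name and the R/LDR arguments are illustrative assumptions.

      SUBROUTINE HRCOL_SKETCH( M, N, NB, A, LDA, T, LDT, D, R, LDR,
     $                         INFO )
      IMPLICIT NONE
      INTEGER            M, N, NB, LDA, LDT, LDR, INFO, I
      COMPLEX*16         A( LDA, * ), T( LDT, * ), D( * ), R( LDR, * )
*     Reconstruct Householder vectors: on exit V is stored below the
*     diagonal of A, the block reflectors in T(1:NB,1:N), and the
*     +/-1 diagonal of S in D, with Q_in = Q_out * S.
      CALL ZUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
      IF( INFO.NE.0 )
     $   RETURN
*     If Q_in came from B = Q_in * R_in, form R_out = S * R_in by
*     flipping the sign of row I of R whenever D( I ) = -1.
      DO I = 1, MIN( M, N )
         IF( DBLE( D( I ) ).LT.0.0D0 )
     $      CALL ZDSCAL( N-I+1, -1.0D0, R( I, I ), LDR )
      END DO
      END
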
+* +* (1) Compute the unit lower-trapezoidal V (ones on the diagonal +* are not stored) by performing the "modified" LU-decomposition. +* +* Q_in - ( S ) = V * U = ( V1 ) * U, +* ( 0 ) ( V2 ) +* +* where 0 is an (M-N)-by-N zero matrix. +* +* (1-1) Factor V1 and U. + + CALL ZLAUNHR_COL_GETRFNP( N, N, A, LDA, D, IINFO ) +* +* (1-2) Solve for V2. +* + IF( M.GT.N ) THEN + CALL ZTRSM( 'R', 'U', 'N', 'N', M-N, N, CONE, A, LDA, + $ A( N+1, 1 ), LDA ) + END IF +* +* (2) Reconstruct the block reflector T stored in T(1:NB, 1:N) +* as a sequence of upper-triangular blocks with NB-size column +* blocking. +* +* Loop over the column blocks of size NB of the array A(1:M,1:N) +* and the array T(1:NB,1:N), JB is the column index of a column +* block, JNB is the column block size at each step JB. +* + NPLUSONE = N + 1 + DO JB = 1, N, NB +* +* (2-0) Determine the column block size JNB. +* + JNB = MIN( NPLUSONE-JB, NB ) +* +* (2-1) Copy the upper-triangular part of the current JNB-by-JNB +* diagonal block U(JB) (of the N-by-N matrix U) stored +* in A(JB:JB+JNB-1,JB:JB+JNB-1) into the upper-triangular part +* of the current JNB-by-JNB block T(1:JNB,JB:JB+JNB-1) +* column-by-column, total JNB*(JNB+1)/2 elements. +* + JBTEMP1 = JB - 1 + DO J = JB, JB+JNB-1 + CALL ZCOPY( J-JBTEMP1, A( JB, J ), 1, T( 1, J ), 1 ) + END DO +* +* (2-2) Perform on the upper-triangular part of the current +* JNB-by-JNB diagonal block U(JB) (of the N-by-N matrix U) stored +* in T(1:JNB,JB:JB+JNB-1) the following operation in place: +* (-1)*U(JB)*S(JB), i.e the result will be stored in the upper- +* triangular part of T(1:JNB,JB:JB+JNB-1). This multiplication +* of the JNB-by-JNB diagonal block U(JB) by the JNB-by-JNB +* diagonal block S(JB) of the N-by-N sign matrix S from the +* right means changing the sign of each J-th column of the block +* U(JB) according to the sign of the diagonal element of the block +* S(JB), i.e. S(J,J) that is stored in the array element D(J). +* + DO J = JB, JB+JNB-1 + IF( D( J ).EQ.CONE ) THEN + CALL ZSCAL( J-JBTEMP1, -CONE, T( 1, J ), 1 ) + END IF + END DO +* +* (2-3) Perform the triangular solve for the current block +* matrix X(JB): +* +* X(JB) * (A(JB)**T) = B(JB), where: +* +* A(JB)**T is a JNB-by-JNB unit upper-triangular +* coefficient block, and A(JB)=V1(JB), which +* is a JNB-by-JNB unit lower-triangular block +* stored in A(JB:JB+JNB-1,JB:JB+JNB-1). +* The N-by-N matrix V1 is the upper part +* of the M-by-N lower-trapezoidal matrix V +* stored in A(1:M,1:N); +* +* B(JB) is a JNB-by-JNB upper-triangular right-hand +* side block, B(JB) = (-1)*U(JB)*S(JB), and +* B(JB) is stored in T(1:JNB,JB:JB+JNB-1); +* +* X(JB) is a JNB-by-JNB upper-triangular solution +* block, X(JB) is the upper-triangular block +* reflector T(JB), and X(JB) is stored +* in T(1:JNB,JB:JB+JNB-1). +* +* In other words, we perform the triangular solve for the +* upper-triangular block T(JB): +* +* T(JB) * (V1(JB)**T) = (-1)*U(JB)*S(JB). +* +* Even though the blocks X(JB) and B(JB) are upper- +* triangular, the routine ZTRSM will access all JNB**2 +* elements of the square T(1:JNB,JB:JB+JNB-1). Therefore, +* we need to set to zero the elements of the block +* T(1:JNB,JB:JB+JNB-1) below the diagonal before the call +* to ZTRSM. +* +* (2-3a) Set the elements to zero. +* + JBTEMP2 = JB - 2 + DO J = JB, JB+JNB-2 + DO I = J-JBTEMP2, NB + T( I, J ) = CZERO + END DO + END DO +* +* (2-3b) Perform the triangular solve. 
+* + CALL ZTRSM( 'R', 'L', 'C', 'U', JNB, JNB, CONE, + $ A( JB, JB ), LDA, T( 1, JB ), LDT ) +* + END DO +* + RETURN +* +* End of ZUNHR_COL +* + END \ No newline at end of file diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index ec3d85221..d5ca95013 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -161,7 +161,7 @@ endif() # Only run this test if python 2.7 or greater is found if(PYTHONINTERP_FOUND) message(STATUS "Running Summary") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${LAPACK_SOURCE_DIR}/lapack_testing.py ${LAPACK_BINARY_DIR}) + file(COPY ${LAPACK_SOURCE_DIR}/lapack_testing.py DESTINATION ${LAPACK_BINARY_DIR}) add_test( NAME LAPACK_Test_Summary WORKING_DIRECTORY ${LAPACK_BINARY_DIR} diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 78046125a..b3efebcd0 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -1,5 +1,3 @@ -include ../../make.inc - ######################################################################## # This is the makefile for the eigenvalue test program from LAPACK. # The test files are organized as follows: @@ -33,6 +31,9 @@ include ../../make.inc # ######################################################################## +TOPSRCDIR = ../.. +include $(TOPSRCDIR)/make.inc + AEIGTST = \ alahdg.o \ alasum.o \ @@ -117,24 +118,26 @@ ZEIGTST = zchkee.o \ zsgt01.o zslect.o \ zstt21.o zstt22.o zunt01.o zunt03.o +.PHONY: all all: single complex double complex16 +.PHONY: single complex double complex16 single: xeigtsts complex: xeigtstc double: xeigtstd complex16: xeigtstz -xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) @@ -147,6 +150,7 @@ $(ZEIGTST): $(FRC) FRC: @FRC=$(FRC) +.PHONY: clean cleanobj cleanexe clean: cleanobj cleanexe cleanobj: rm -f *.o @@ -154,13 +158,10 @@ cleanexe: rm -f xeigtst* schkee.o: schkee.f - $(FORTRAN) $(DRVOPTS) -c -o $@ $< + $(FC) $(FFLAGS_DRV) -c -o $@ $< dchkee.o: dchkee.f - $(FORTRAN) $(DRVOPTS) -c -o $@ $< + $(FC) $(FFLAGS_DRV) -c -o $@ $< cchkee.o: cchkee.f - $(FORTRAN) $(DRVOPTS) -c -o $@ $< + $(FC) $(FFLAGS_DRV) -c -o $@ $< zchkee.o: zchkee.f - $(FORTRAN) $(DRVOPTS) -c -o $@ $< - -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< + $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/cbdt05.f b/lapack-netlib/TESTING/EIG/cbdt05.f index 192a8d0b6..5a08ccce3 100644 --- a/lapack-netlib/TESTING/EIG/cbdt05.f +++ b/lapack-netlib/TESTING/EIG/cbdt05.f @@ -52,6 +52,7 @@ *> \verbatim *> A is COMPLEX array, dimension 
(LDA,N) *> The m by n matrix A. +*> \endverbatim *> *> \param[in] LDA *> \verbatim diff --git a/lapack-netlib/TESTING/EIG/cchkst.f b/lapack-netlib/TESTING/EIG/cchkst.f index 471fe9c92..2d25f3fb1 100644 --- a/lapack-netlib/TESTING/EIG/cchkst.f +++ b/lapack-netlib/TESTING/EIG/cchkst.f @@ -167,7 +167,7 @@ *> CSTEMR('V', 'I') *> *> Tests 29 through 34 are disable at present because CSTEMR -*> does not handle partial specturm requests. +*> does not handle partial spectrum requests. *> *> (29) | S - Z D Z* | / ( |S| n ulp ) CSTEMR('V', 'I') *> diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f index df610c207..5c478577f 100644 --- a/lapack-netlib/TESTING/EIG/cchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f @@ -188,7 +188,7 @@ *> CSTEMR('V', 'I') *> *> Tests 29 through 34 are disable at present because CSTEMR -*> does not handle partial specturm requests. +*> does not handle partial spectrum requests. *> *> (29) | S - Z D Z* | / ( |S| n ulp ) CSTEMR('V', 'I') *> diff --git a/lapack-netlib/TESTING/EIG/cdrgsx.f b/lapack-netlib/TESTING/EIG/cdrgsx.f index 4e0f8b468..746946d07 100644 --- a/lapack-netlib/TESTING/EIG/cdrgsx.f +++ b/lapack-netlib/TESTING/EIG/cdrgsx.f @@ -737,7 +737,7 @@ CALL CLACPY( 'Full', MPLUSN, MPLUSN, AI, LDA, A, LDA ) CALL CLACPY( 'Full', MPLUSN, MPLUSN, BI, LDA, B, LDA ) * -* Compute the Schur factorization while swaping the +* Compute the Schur factorization while swapping the * m-by-m (1,1)-blocks with n-by-n (2,2)-blocks. * CALL CGGESX( 'V', 'V', 'S', CLCTSX, 'B', MPLUSN, AI, LDA, BI, LDA, diff --git a/lapack-netlib/TESTING/EIG/cdrvbd.f b/lapack-netlib/TESTING/EIG/cdrvbd.f index 64bed3b13..7b7b01b47 100644 --- a/lapack-netlib/TESTING/EIG/cdrvbd.f +++ b/lapack-netlib/TESTING/EIG/cdrvbd.f @@ -33,8 +33,9 @@ *> *> \verbatim *> -*> CDRVBD checks the singular value decomposition (SVD) driver CGESVD -*> and CGESDD. +*> CDRVBD checks the singular value decomposition (SVD) driver CGESVD, +*> CGESDD, CGESVJ, CGEJSV, CGESVDX, and CGESVDQ. +*> *> CGESVD and CGESDD factors A = U diag(S) VT, where U and VT are *> unitary and diag(S) is diagonal with the entries of the array S on *> its diagonal. The entries of S are the singular values, nonnegative @@ -73,81 +74,92 @@ *> *> Test for CGESDD: *> -*> (1) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> (8) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) *> -*> (2) | I - U'U | / ( M ulp ) +*> (9) | I - U'U | / ( M ulp ) *> -*> (3) | I - VT VT' | / ( N ulp ) +*> (10) | I - VT VT' | / ( N ulp ) *> -*> (4) S contains MNMIN nonnegative values in decreasing order. +*> (11) S contains MNMIN nonnegative values in decreasing order. *> (Return 0 if true, 1/ULP if false.) *> -*> (5) | U - Upartial | / ( M ulp ) where Upartial is a partially +*> (12) | U - Upartial | / ( M ulp ) where Upartial is a partially *> computed U. *> -*> (6) | VT - VTpartial | / ( N ulp ) where VTpartial is a partially +*> (13) | VT - VTpartial | / ( N ulp ) where VTpartial is a partially *> computed VT. *> -*> (7) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the +*> (14) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the *> vector of singular values from the partial SVD *> +*> Test for CGESVDQ: +*> +*> (36) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> +*> (37) | I - U'U | / ( M ulp ) +*> +*> (38) | I - VT VT' | / ( N ulp ) +*> +*> (39) S contains MNMIN nonnegative values in decreasing order. +*> (Return 0 if true, 1/ULP if false.) 
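
For reference, a minimal sketch of the CGESVDQ call exercised by tests 36 through 39, not part of the test suite. The job options and argument order mirror the driver call added further below ('H','N','N','A','A' selects the high-accuracy mode and requests full U and V**H); the workspace lengths are whatever the caller supplies, and a query with a negative workspace length would be the usual way to obtain exact requirements.

      SUBROUTINE SVDQ_SKETCH( M, N, A, LDA, S, U, LDU, VT, LDVT,
     $                        IWORK, LIWORK, WORK, LWORK, RWORK,
     $                        LRWORK, INFO )
      IMPLICIT NONE
      INTEGER            M, N, LDA, LDU, LDVT, LIWORK, LWORK,
     $                   LRWORK, INFO, NUMRANK
      INTEGER            IWORK( * )
      REAL               S( * ), RWORK( * )
      COMPLEX            A( LDA, * ), U( LDU, * ), VT( LDVT, * ),
     $                   WORK( * )
*     QR-preconditioned SVD: A is overwritten, S receives the
*     singular values, U and VT the full singular vector matrices,
*     and NUMRANK the numerical rank detected in the preconditioning
*     step.
      CALL CGESVDQ( 'H', 'N', 'N', 'A', 'A', M, N, A, LDA, S,
     $              U, LDU, VT, LDVT, NUMRANK, IWORK, LIWORK,
     $              WORK, LWORK, RWORK, LRWORK, INFO )
      END
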
+*> *> Test for CGESVJ: *> -*> (1) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> (15) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) *> -*> (2) | I - U'U | / ( M ulp ) +*> (16) | I - U'U | / ( M ulp ) *> -*> (3) | I - VT VT' | / ( N ulp ) +*> (17) | I - VT VT' | / ( N ulp ) *> -*> (4) S contains MNMIN nonnegative values in decreasing order. +*> (18) S contains MNMIN nonnegative values in decreasing order. *> (Return 0 if true, 1/ULP if false.) *> *> Test for CGEJSV: *> -*> (1) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> (19) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) *> -*> (2) | I - U'U | / ( M ulp ) +*> (20) | I - U'U | / ( M ulp ) *> -*> (3) | I - VT VT' | / ( N ulp ) +*> (21) | I - VT VT' | / ( N ulp ) *> -*> (4) S contains MNMIN nonnegative values in decreasing order. +*> (22) S contains MNMIN nonnegative values in decreasing order. *> (Return 0 if true, 1/ULP if false.) *> *> Test for CGESVDX( 'V', 'V', 'A' )/CGESVDX( 'N', 'N', 'A' ) *> -*> (1) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> (23) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) *> -*> (2) | I - U'U | / ( M ulp ) +*> (24) | I - U'U | / ( M ulp ) *> -*> (3) | I - VT VT' | / ( N ulp ) +*> (25) | I - VT VT' | / ( N ulp ) *> -*> (4) S contains MNMIN nonnegative values in decreasing order. +*> (26) S contains MNMIN nonnegative values in decreasing order. *> (Return 0 if true, 1/ULP if false.) *> -*> (5) | U - Upartial | / ( M ulp ) where Upartial is a partially +*> (27) | U - Upartial | / ( M ulp ) where Upartial is a partially *> computed U. *> -*> (6) | VT - VTpartial | / ( N ulp ) where VTpartial is a partially +*> (28) | VT - VTpartial | / ( N ulp ) where VTpartial is a partially *> computed VT. *> -*> (7) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the +*> (29) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the *> vector of singular values from the partial SVD *> *> Test for CGESVDX( 'V', 'V', 'I' ) *> -*> (8) | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp ) +*> (30) | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp ) *> -*> (9) | I - U'U | / ( M ulp ) +*> (31) | I - U'U | / ( M ulp ) *> -*> (10) | I - VT VT' | / ( N ulp ) +*> (32) | I - VT VT' | / ( N ulp ) *> *> Test for CGESVDX( 'V', 'V', 'V' ) *> -*> (11) | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp ) +*> (33) | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp ) *> -*> (12) | I - U'U | / ( M ulp ) +*> (34) | I - U'U | / ( M ulp ) *> -*> (13) | I - VT VT' | / ( N ulp ) +*> (35) | I - VT VT' | / ( N ulp ) *> *> The "sizes" are specified by the arrays MM(1:NSIZES) and *> NN(1:NSIZES); the value of each element pair (MM(j),NN(j)) @@ -393,6 +405,8 @@ * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * June 2016 +* + IMPLICIT NONE * * .. Scalar Arguments .. INTEGER INFO, LDA, LDU, LDVT, LWORK, NOUNIT, NSIZES, @@ -411,7 +425,7 @@ * ===================================================================== * * .. Parameters .. - REAL ZERO, ONE, TWO, HALF + REAL ZERO, ONE, TWO, HALF PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0, TWO = 2.0E0, $ HALF = 0.5E0 ) COMPLEX CZERO, CONE @@ -431,10 +445,13 @@ REAL ANORM, DIF, DIV, OVFL, RTUNFL, ULP, ULPINV, $ UNFL, VL, VU * .. +* .. Local Scalars for CGESVDQ .. + INTEGER LIWORK, NUMRANK +* .. * .. Local Arrays .. CHARACTER CJOB( 4 ), CJOBR( 3 ), CJOBV( 2 ) INTEGER IOLDSD( 4 ), ISEED2( 4 ) - REAL RESULT( 35 ) + REAL RESULT( 39 ) * .. * .. External Functions .. REAL SLAMCH, SLARND @@ -442,8 +459,8 @@ * .. * .. External Subroutines .. 
EXTERNAL ALASVM, XERBLA, CBDT01, CBDT05, CGESDD, - $ CGESVD, CGESVJ, CGEJSV, CGESVDX, CLACPY, - $ CLASET, CLATMS, CUNT01, CUNT03 + $ CGESVD, CGESVDQ, CGESVJ, CGEJSV, CGESVDX, + $ CLACPY, CLASET, CLATMS, CUNT01, CUNT03 * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, MAX, MIN @@ -838,8 +855,64 @@ 130 CONTINUE * -* Test CGESVJ: Factorize A -* Note: CGESVJ does not work for M < N +* Test CGESVDQ +* Note: CGESVDQ only works for M >= N +* + RESULT( 36 ) = ZERO + RESULT( 37 ) = ZERO + RESULT( 38 ) = ZERO + RESULT( 39 ) = ZERO +* + IF( M.GE.N ) THEN + IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) + LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 + LSWORK = MIN( LSWORK, LWORK ) + LSWORK = MAX( LSWORK, 1 ) + IF( IWSPC.EQ.4 ) + $ LSWORK = LWORK +* + CALL CLACPY( 'F', M, N, ASAV, LDA, A, LDA ) + SRNAMT = 'CGESVDQ' +* + LRWORK = MAX(2, M, 5*N) + LIWORK = MAX( N, 1 ) + CALL CGESVDQ( 'H', 'N', 'N', 'A', 'A', + $ M, N, A, LDA, SSAV, USAV, LDU, + $ VTSAV, LDVT, NUMRANK, IWORK, LIWORK, + $ WORK, LWORK, RWORK, LRWORK, IINFO ) +* + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9995 )'CGESVDQ', IINFO, M, N, + $ JTYPE, LSWORK, IOLDSD + INFO = ABS( IINFO ) + RETURN + END IF +* +* Do tests 36--39 +* + CALL CBDT01( M, N, 0, ASAV, LDA, USAV, LDU, SSAV, E, + $ VTSAV, LDVT, WORK, RWORK, RESULT( 36 ) ) + IF( M.NE.0 .AND. N.NE.0 ) THEN + CALL CUNT01( 'Columns', M, M, USAV, LDU, WORK, + $ LWORK, RWORK, RESULT( 37 ) ) + CALL CUNT01( 'Rows', N, N, VTSAV, LDVT, WORK, + $ LWORK, RWORK, RESULT( 38 ) ) + END IF + RESULT( 39 ) = ZERO + DO 199 I = 1, MNMIN - 1 + IF( SSAV( I ).LT.SSAV( I+1 ) ) + $ RESULT( 39 ) = ULPINV + IF( SSAV( I ).LT.ZERO ) + $ RESULT( 39 ) = ULPINV + 199 CONTINUE + IF( MNMIN.GE.1 ) THEN + IF( SSAV( MNMIN ).LT.ZERO ) + $ RESULT( 39 ) = ULPINV + END IF + END IF +* +* Test CGESVJ +* Note: CGESVJ only works for M >= N * RESULT( 15 ) = ZERO RESULT( 16 ) = ZERO @@ -847,13 +920,13 @@ RESULT( 18 ) = ZERO * IF( M.GE.N ) THEN - IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) - LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 - LSWORK = MIN( LSWORK, LWORK ) - LSWORK = MAX( LSWORK, 1 ) - LRWORK = MAX(6,N) - IF( IWSPC.EQ.4 ) - $ LSWORK = LWORK + IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) + LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 + LSWORK = MIN( LSWORK, LWORK ) + LSWORK = MAX( LSWORK, 1 ) + LRWORK = MAX(6,N) + IF( IWSPC.EQ.4 ) + $ LSWORK = LWORK * CALL CLACPY( 'F', M, N, ASAV, LDA, USAV, LDA ) SRNAMT = 'CGESVJ' @@ -861,8 +934,7 @@ & 0, A, LDVT, WORK, LWORK, RWORK, & LRWORK, IINFO ) * -* CGESVJ retuns V not VT, so we transpose to use the same -* test suite. 
+* CGESVJ returns V not VH * DO J=1,N DO I=1,N @@ -900,31 +972,30 @@ END IF END IF * -* Test CGEJSV: Factorize A -* Note: CGEJSV does not work for M < N +* Test CGEJSV +* Note: CGEJSV only works for M >= N * RESULT( 19 ) = ZERO RESULT( 20 ) = ZERO RESULT( 21 ) = ZERO RESULT( 22 ) = ZERO IF( M.GE.N ) THEN - IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) - LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 - LSWORK = MIN( LSWORK, LWORK ) - LSWORK = MAX( LSWORK, 1 ) - IF( IWSPC.EQ.4 ) - $ LSWORK = LWORK - LRWORK = MAX( 7, N + 2*M) -* - CALL CLACPY( 'F', M, N, ASAV, LDA, VTSAV, LDA ) + IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) + LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 + LSWORK = MIN( LSWORK, LWORK ) + LSWORK = MAX( LSWORK, 1 ) + IF( IWSPC.EQ.4 ) + $ LSWORK = LWORK + LRWORK = MAX( 7, N + 2*M) +* + CALL CLACPY( 'F', M, N, ASAV, LDA, VTSAV, LDA ) SRNAMT = 'CGEJSV' CALL CGEJSV( 'G', 'U', 'V', 'R', 'N', 'N', & M, N, VTSAV, LDA, SSAV, USAV, LDU, A, LDVT, & WORK, LWORK, RWORK, & LRWORK, IWORK, IINFO ) * -* CGEJSV retuns V not VT, so we transpose to use the same -* test suite. +* CGEJSV returns V not VH * DO 133 J=1,N DO 132 I=1,N @@ -933,7 +1004,7 @@ 133 END DO * IF( IINFO.NE.0 ) THEN - WRITE( NOUNIT, FMT = 9995 )'GESVJ', IINFO, M, N, + WRITE( NOUNIT, FMT = 9995 )'GEJSV', IINFO, M, N, $ JTYPE, LSWORK, IOLDSD INFO = ABS( IINFO ) RETURN @@ -1160,7 +1231,7 @@ * NTEST = 0 NFAIL = 0 - DO 190 J = 1, 35 + DO 190 J = 1, 39 IF( RESULT( J ).GE.ZERO ) $ NTEST = NTEST + 1 IF( RESULT( J ).GE.THRESH ) @@ -1175,7 +1246,7 @@ NTESTF = 2 END IF * - DO 200 J = 1, 35 + DO 200 J = 1, 39 IF( RESULT( J ).GE.THRESH ) THEN WRITE( NOUNIT, FMT = 9997 )M, N, JTYPE, IWSPC, $ IOLDSD, J, RESULT( J ) @@ -1251,6 +1322,12 @@ $ / '33 = | U**T A VT**T - diag(S) | / ( |A| max(M,N) ulp )', $ / '34 = | I - U**T U | / ( M ulp ) ', $ / '35 = | I - VT VT**T | / ( N ulp ) ', + $ ' CGESVDQ(H,N,N,A,A', + $ / '36 = | A - U diag(S) VT | / ( |A| max(M,N) ulp ) ', + $ / '37 = | I - U**T U | / ( M ulp ) ', + $ / '38 = | I - VT VT**T | / ( N ulp ) ', + $ / '39 = 0 if S contains min(M,N) nonnegative values in', + $ ' decreasing order, else 1/ulp', $ / / ) 9997 FORMAT( ' M=', I5, ', N=', I5, ', type ', I1, ', IWS=', I1, $ ', seed=', 4( I4, ',' ), ' test(', I2, ')=', G11.4 ) diff --git a/lapack-netlib/TESTING/EIG/cerred.f b/lapack-netlib/TESTING/EIG/cerred.f index f1670e983..a0ceff76e 100644 --- a/lapack-netlib/TESTING/EIG/cerred.f +++ b/lapack-netlib/TESTING/EIG/cerred.f @@ -36,6 +36,8 @@ *> CGEJSV compute SVD of an M-by-N matrix A where M >= N *> CGESVDX compute SVD of an M-by-N matrix A(by bisection *> and inverse iteration) +*> CGESVDQ compute SVD of an M-by-N matrix A(with a +*> QR-Preconditioned ) *> \endverbatim * * Arguments: @@ -101,7 +103,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, CGEES, CGEESX, CGEEV, CGEEVX, CGEJSV, - $ CGESDD, CGESVD + $ CGESDD, CGESVD, CGESVDX, CGESVDQ * .. * .. External Functions .. 
LOGICAL LSAMEN, CSLECT @@ -495,6 +497,61 @@ ELSE WRITE( NOUT, FMT = 9998 ) END IF +* +* Test CGESVDQ +* + SRNAMT = 'CGESVDQ' + INFOT = 1 + CALL CGESVDQ( 'X', 'P', 'T', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGESVDQ( 'A', 'X', 'T', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGESVDQ( 'A', 'P', 'X', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGESVDQ( 'A', 'P', 'T', 'X', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGESVDQ( 'A', 'P', 'T', 'A', 'X', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', -1, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 0, 1, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 0, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ -1, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ 1, VT, -1, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 17 + CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ 1, VT, 1, NS, IW, -5, W, 1, RW, 1, INFO ) + CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK ) + NT = 11 + IF( OK ) THEN + WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), + $ NT + ELSE + WRITE( NOUT, FMT = 9998 ) + END IF END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/cget51.f b/lapack-netlib/TESTING/EIG/cget51.f index ce1108aa4..ec58086d4 100644 --- a/lapack-netlib/TESTING/EIG/cget51.f +++ b/lapack-netlib/TESTING/EIG/cget51.f @@ -29,12 +29,13 @@ *> *> CGET51 generally checks a decomposition of the form *> -*> A = U B VC> -*> where * means conjugate transpose and U and V are unitary. +*> A = U B V**H +*> +*> where **H means conjugate transpose and U and V are unitary. *> *> Specifically, if ITYPE=1 *> -*> RESULT = | A - U B V* | / ( |A| n ulp ) +*> RESULT = | A - U B V**H | / ( |A| n ulp ) *> *> If ITYPE=2, then: *> @@ -42,7 +43,7 @@ *> *> If ITYPE=3, then: *> -*> RESULT = | I - UU* | / ( n ulp ) +*> RESULT = | I - U U**H | / ( n ulp ) *> \endverbatim * * Arguments: @@ -52,9 +53,9 @@ *> \verbatim *> ITYPE is INTEGER *> Specifies the type of tests to be performed. 
-*> =1: RESULT = | A - U B V* | / ( |A| n ulp ) +*> =1: RESULT = | A - U B V**H | / ( |A| n ulp ) *> =2: RESULT = | A - B | / ( |A| n ulp ) -*> =3: RESULT = | I - UU* | / ( n ulp ) +*> =3: RESULT = | I - U U**H | / ( n ulp ) *> \endverbatim *> *> \param[in] N @@ -218,7 +219,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: Compute W = A - UBV' +* ITYPE=1: Compute W = A - U B V**H * CALL CLACPY( ' ', N, N, A, LDA, WORK, N ) CALL CGEMM( 'N', 'N', N, N, N, CONE, U, LDU, B, LDB, CZERO, @@ -259,7 +260,7 @@ * * Tests not scaled by norm(A) * -* ITYPE=3: Compute UU' - I +* ITYPE=3: Compute U U**H - I * CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, $ WORK, N ) diff --git a/lapack-netlib/TESTING/EIG/chbt21.f b/lapack-netlib/TESTING/EIG/chbt21.f index 90ec74c23..76eb7d115 100644 --- a/lapack-netlib/TESTING/EIG/chbt21.f +++ b/lapack-netlib/TESTING/EIG/chbt21.f @@ -28,14 +28,16 @@ *> *> CHBT21 generally checks a decomposition of the form *> -*> A = U S UC> -*> where * means conjugate transpose, A is hermitian banded, U is +*> A = U S U**H +*> +*> where **H means conjugate transpose, A is hermitian banded, U is *> unitary, and S is diagonal (if KS=0) or symmetric *> tridiagonal (if KS=1). *> *> Specifically: *> -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> \endverbatim * * Arguments: @@ -220,7 +222,7 @@ * ANORM = MAX( CLANHB( '1', CUPLO, N, IKA, A, LDA, RWORK ), UNFL ) * -* Compute error matrix: Error = A - U S U* +* Compute error matrix: Error = A - U S U**H * * Copy A from SB to SP storage format. * @@ -271,7 +273,7 @@ * * Do Test 2 * -* Compute UU* - I +* Compute U U**H - I * CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, WORK, $ N ) diff --git a/lapack-netlib/TESTING/EIG/chet21.f b/lapack-netlib/TESTING/EIG/chet21.f index 5aff64904..d5c4f1348 100644 --- a/lapack-netlib/TESTING/EIG/chet21.f +++ b/lapack-netlib/TESTING/EIG/chet21.f @@ -29,8 +29,9 @@ *> *> CHET21 generally checks a decomposition of the form *> -*> A = U S UC> -*> where * means conjugate transpose, A is hermitian, U is unitary, and +*> A = U S U**H +*> +*> where **H means conjugate transpose, A is hermitian, U is unitary, and *> S is diagonal (if KBAND=0) or (real) symmetric tridiagonal (if *> KBAND=1). *> @@ -42,18 +43,19 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> If ITYPE=2, then: *> -*> RESULT(1) = | A - V S V* | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**H | / ( |A| n ulp ) *> *> If ITYPE=3, then: *> -*> RESULT(1) = | I - UV* | / ( n ulp ) +*> RESULT(1) = | I - U V**H | / ( n ulp ) *> *> For ITYPE > 1, the transformation U is expressed as a product -*> V = H(1)...H(n-2), where H(j) = I - tau(j) v(j) v(j)C> and each +*> V = H(1)...H(n-2), where H(j) = I - tau(j) v(j) v(j)**H and each *> vector v(j) has its first j elements 0 and the remaining n-j elements *> stored in V(j+1:n,j). *> \endverbatim @@ -66,14 +68,15 @@ *> ITYPE is INTEGER *> Specifies the type of tests to be performed. 
*> 1: U expressed as a dense unitary matrix: -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> 2: U expressed as a product V of Housholder transformations: -*> RESULT(1) = | A - V S V* | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**H | / ( |A| n ulp ) *> *> 3: U expressed both as a dense unitary matrix and *> as a product of Housholder transformations: -*> RESULT(1) = | I - UV* | / ( n ulp ) +*> RESULT(1) = | I - U V**H | / ( n ulp ) *> \endverbatim *> *> \param[in] UPLO @@ -171,7 +174,7 @@ *> \verbatim *> TAU is COMPLEX array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)* in the Householder transformation H(j) of +*> v(j) v(j)**H in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. *> \endverbatim @@ -294,7 +297,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: error = A - U S U* +* ITYPE=1: error = A - U S U**H * CALL CLASET( 'Full', N, N, CZERO, CZERO, WORK, N ) CALL CLACPY( CUPLO, N, N, A, LDA, WORK, N ) @@ -304,7 +307,6 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN -CMK DO 20 J = 1, N - 1 DO 20 J = 2, N - 1 CALL CHER2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK, N ) @@ -314,7 +316,7 @@ CMK DO 20 J = 1, N - 1 * ELSE IF( ITYPE.EQ.2 ) THEN * -* ITYPE=2: error = V S V* - A +* ITYPE=2: error = V S V**H - A * CALL CLASET( 'Full', N, N, CZERO, CZERO, WORK, N ) * @@ -371,7 +373,7 @@ CMK DO 20 J = 1, N - 1 * ELSE IF( ITYPE.EQ.3 ) THEN * -* ITYPE=3: error = U V* - I +* ITYPE=3: error = U V**H - I * IF( N.LT.2 ) $ RETURN @@ -407,7 +409,7 @@ CMK DO 20 J = 1, N - 1 * * Do Test 2 * -* Compute UU* - I +* Compute U U**H - I * IF( ITYPE.EQ.1 ) THEN CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, diff --git a/lapack-netlib/TESTING/EIG/chet22.f b/lapack-netlib/TESTING/EIG/chet22.f index 5087ecbca..354387f2a 100644 --- a/lapack-netlib/TESTING/EIG/chet22.f +++ b/lapack-netlib/TESTING/EIG/chet22.f @@ -42,7 +42,8 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | U' A U - S | / ( |A| m ulp ) *andC> RESULT(2) = | I - U'U | / ( m ulp ) +*> RESULT(1) = | U**H A U - S | / ( |A| m ulp ) and +*> RESULT(2) = | I - U**H U | / ( m ulp ) *> \endverbatim * * Arguments: @@ -52,7 +53,8 @@ *> ITYPE INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense orthogonal matrix: -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> UPLO CHARACTER *> If UPLO='U', the upper triangle of A will be used and the @@ -122,7 +124,7 @@ *> *> TAU COMPLEX array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)' in the Householder transformation H(j) of +*> v(j) v(j)**H in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. *> Not modified. 
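The unitarity check that CHET22 reports as RESULT(2) reduces to forming the Gram matrix of U and measuring its distance from the identity in norm. A minimal sketch of that computation, not part of the patch (the helper name CGRAMR and its argument list are hypothetical), assuming only the standard BLAS/LAPACK routines CGEMM, CLANGE and SLAMCH:

      REAL FUNCTION CGRAMR( M, U, LDU, WORK, LDW )
*     Hypothetical helper, not part of this patch: a sketch of the
*     unitarity residual | I - U**H U | / ( M ulp ) reported as
*     RESULT(2) by the CHET2x checks.  WORK must be at least M-by-M.
      INTEGER            M, LDU, LDW
      COMPLEX            U( LDU, * ), WORK( LDW, * )
      INTEGER            J
      REAL               RDUM( 1 )
      COMPLEX            CONE, CZERO
      PARAMETER          ( CONE = ( 1.0E0, 0.0E0 ),
     $                   CZERO = ( 0.0E0, 0.0E0 ) )
      REAL               CLANGE, SLAMCH
      EXTERNAL           CLANGE, SLAMCH, CGEMM
*
*     WORK = U**H * U
*
      CALL CGEMM( 'C', 'N', M, M, M, CONE, U, LDU, U, LDU, CZERO,
     $            WORK, LDW )
*
*     Subtract the identity, then scale the 1-norm by M*ulp
*
      DO 10 J = 1, M
         WORK( J, J ) = WORK( J, J ) - CONE
   10 CONTINUE
      CGRAMR = CLANGE( '1', M, M, WORK, LDW, RDUM ) /
     $         ( REAL( M )*SLAMCH( 'Precision' ) )
      RETURN
      END

The same pattern with CGEMM( 'N', 'C', ... ) gives the | U U**H - I | variant that appears in the Test 2 sections of the checks below.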
@@ -215,7 +217,7 @@ * * Compute error matrix: * -* ITYPE=1: error = U' A U - S +* ITYPE=1: error = U**H A U - S * CALL CHEMM( 'L', UPLO, N, M, CONE, A, LDA, U, LDU, CZERO, WORK, $ N ) @@ -249,7 +251,7 @@ * * Do Test 2 * -* Compute U'U - I +* Compute U**H U - I * IF( ITYPE.EQ.1 ) $ CALL CUNT01( 'Columns', N, M, U, LDU, WORK, 2*N*N, RWORK, diff --git a/lapack-netlib/TESTING/EIG/chpt21.f b/lapack-netlib/TESTING/EIG/chpt21.f index e151a8bd8..f20921bd9 100644 --- a/lapack-netlib/TESTING/EIG/chpt21.f +++ b/lapack-netlib/TESTING/EIG/chpt21.f @@ -29,8 +29,9 @@ *> *> CHPT21 generally checks a decomposition of the form *> -*> A = U S UC> -*> where * means conjugate transpose, A is hermitian, U is +*> A = U S U**H +*> +*> where **H means conjugate transpose, A is hermitian, U is *> unitary, and S is diagonal (if KBAND=0) or (real) symmetric *> tridiagonal (if KBAND=1). If ITYPE=1, then U is represented as *> a dense matrix, otherwise the U is expressed as a product of @@ -41,15 +42,16 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> If ITYPE=2, then: *> -*> RESULT(1) = | A - V S V* | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**H | / ( |A| n ulp ) *> *> If ITYPE=3, then: *> -*> RESULT(1) = | I - UV* | / ( n ulp ) +*> RESULT(1) = | I - U V**H | / ( n ulp ) *> *> Packed storage means that, for example, if UPLO='U', then the columns *> of the upper triangle of A are stored one after another, so that @@ -70,14 +72,16 @@ *> *> If UPLO='U', then V = H(n-1)...H(1), where *> -*> H(j) = I - tau(j) v(j) v(j)C> +*> H(j) = I - tau(j) v(j) v(j)**H +*> *> and the first j-1 elements of v(j) are stored in V(1:j-1,j+1), *> (i.e., VP( j*(j+1)/2 + 1 : j*(j+1)/2 + j-1 ) ), *> the j-th element is 1, and the last n-j elements are 0. *> *> If UPLO='L', then V = H(1)...H(n-1), where *> -*> H(j) = I - tau(j) v(j) v(j)C> +*> H(j) = I - tau(j) v(j) v(j)**H +*> *> and the first j elements of v(j) are 0, the (j+1)-st is 1, and the *> (j+2)-nd through n-th elements are stored in V(j+2:n,j) (i.e., *> in VP( (2*n-j)*(j-1)/2 + j+2 : (2*n-j)*(j-1)/2 + n ) .) @@ -91,14 +95,15 @@ *> ITYPE is INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense unitary matrix: -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> 2: U expressed as a product V of Housholder transformations: -*> RESULT(1) = | A - V S V* | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**H | / ( |A| n ulp ) *> *> 3: U expressed both as a dense unitary matrix and *> as a product of Housholder transformations: -*> RESULT(1) = | I - UV* | / ( n ulp ) +*> RESULT(1) = | I - U V**H | / ( n ulp ) *> \endverbatim *> *> \param[in] UPLO @@ -181,7 +186,7 @@ *> \verbatim *> TAU is COMPLEX array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)* in the Householder transformation H(j) of +*> v(j) v(j)**H in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. 
*> \endverbatim @@ -313,7 +318,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: error = A - U S U* +* ITYPE=1: error = A - U S U**H * CALL CLASET( 'Full', N, N, CZERO, CZERO, WORK, N ) CALL CCOPY( LAP, AP, 1, WORK, 1 ) @@ -332,7 +337,7 @@ * ELSE IF( ITYPE.EQ.2 ) THEN * -* ITYPE=2: error = V S V* - A +* ITYPE=2: error = V S V**H - A * CALL CLASET( 'Full', N, N, CZERO, CZERO, WORK, N ) * @@ -400,7 +405,7 @@ * ELSE IF( ITYPE.EQ.3 ) THEN * -* ITYPE=3: error = U V* - I +* ITYPE=3: error = U V**H - I * IF( N.LT.2 ) $ RETURN @@ -431,7 +436,7 @@ * * Do Test 2 * -* Compute UU* - I +* Compute U U**H - I * IF( ITYPE.EQ.1 ) THEN CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, diff --git a/lapack-netlib/TESTING/EIG/cstt21.f b/lapack-netlib/TESTING/EIG/cstt21.f index 47d99ac49..3fdfa1675 100644 --- a/lapack-netlib/TESTING/EIG/cstt21.f +++ b/lapack-netlib/TESTING/EIG/cstt21.f @@ -28,14 +28,15 @@ *> *> CSTT21 checks a decomposition of the form *> -*> A = U S UC> -*> where * means conjugate transpose, A is real symmetric tridiagonal, +*> A = U S U**H +*> +*> where **H means conjugate transpose, A is real symmetric tridiagonal, *> U is unitary, and S is real and diagonal (if KBAND=0) or symmetric *> tridiagonal (if KBAND=1). Two tests are performed: *> -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) *> -*> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> \endverbatim * * Arguments: @@ -201,7 +202,7 @@ WORK( N**2 ) = AD( N ) ANORM = MAX( ANORM, ABS( AD( N ) )+TEMP1, UNFL ) * -* Norm of A - USU* +* Norm of A - U S U**H * DO 20 J = 1, N CALL CHER( 'L', N, -SD( J ), U( 1, J ), 1, WORK, N ) @@ -228,7 +229,7 @@ * * Do Test 2 * -* Compute UU* - I +* Compute U U**H - I * CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, WORK, $ N ) diff --git a/lapack-netlib/TESTING/EIG/dbdt05.f b/lapack-netlib/TESTING/EIG/dbdt05.f index 3580aec81..356bb5fc8 100644 --- a/lapack-netlib/TESTING/EIG/dbdt05.f +++ b/lapack-netlib/TESTING/EIG/dbdt05.f @@ -52,6 +52,7 @@ *> \verbatim *> A is DOUBLE PRECISION array, dimension (LDA,N) *> The m by n matrix A. +*> \endverbatim *> *> \param[in] LDA *> \verbatim diff --git a/lapack-netlib/TESTING/EIG/dchkst.f b/lapack-netlib/TESTING/EIG/dchkst.f index f08deb529..1b4d85f79 100644 --- a/lapack-netlib/TESTING/EIG/dchkst.f +++ b/lapack-netlib/TESTING/EIG/dchkst.f @@ -166,7 +166,7 @@ *> DSTEMR('V', 'I') *> *> Tests 29 through 34 are disable at present because DSTEMR -*> does not handle partial specturm requests. +*> does not handle partial spectrum requests. *> *> (29) | S - Z D Z' | / ( |S| n ulp ) DSTEMR('V', 'I') *> diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f index fc015334d..ca31c9d1f 100644 --- a/lapack-netlib/TESTING/EIG/dchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f @@ -187,7 +187,7 @@ *> DSTEMR('V', 'I') *> *> Tests 29 through 34 are disable at present because DSTEMR -*> does not handle partial specturm requests. +*> does not handle partial spectrum requests. 
*> *> (29) | S - Z D Z' | / ( |S| n ulp ) DSTEMR('V', 'I') *> diff --git a/lapack-netlib/TESTING/EIG/ddrgsx.f b/lapack-netlib/TESTING/EIG/ddrgsx.f index 44c36407f..7fe9dfc14 100644 --- a/lapack-netlib/TESTING/EIG/ddrgsx.f +++ b/lapack-netlib/TESTING/EIG/ddrgsx.f @@ -769,7 +769,7 @@ CALL DLACPY( 'Full', MPLUSN, MPLUSN, AI, LDA, A, LDA ) CALL DLACPY( 'Full', MPLUSN, MPLUSN, BI, LDA, B, LDA ) * -* Compute the Schur factorization while swaping the +* Compute the Schur factorization while swapping the * m-by-m (1,1)-blocks with n-by-n (2,2)-blocks. * CALL DGGESX( 'V', 'V', 'S', DLCTSX, 'B', MPLUSN, AI, LDA, BI, LDA, diff --git a/lapack-netlib/TESTING/EIG/ddrvbd.f b/lapack-netlib/TESTING/EIG/ddrvbd.f index 868679052..bd4ae60da 100644 --- a/lapack-netlib/TESTING/EIG/ddrvbd.f +++ b/lapack-netlib/TESTING/EIG/ddrvbd.f @@ -32,7 +32,7 @@ *> \verbatim *> *> DDRVBD checks the singular value decomposition (SVD) drivers -*> DGESVD, DGESDD, DGESVJ, and DGEJSV. +*> DGESVD, DGESDD, DGESVDQ, DGESVJ, DGEJSV, and DGESVDX. *> *> Both DGESVD and DGESDD factor A = U diag(S) VT, where U and VT are *> orthogonal and diag(S) is diagonal with the entries of the array S @@ -90,6 +90,17 @@ *> (14) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the *> vector of singular values from the partial SVD *> +*> Test for DGESVDQ: +*> +*> (36) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> +*> (37) | I - U'U | / ( M ulp ) +*> +*> (38) | I - VT VT' | / ( N ulp ) +*> +*> (39) S contains MNMIN nonnegative values in decreasing order. +*> (Return 0 if true, 1/ULP if false.) +*> *> Test for DGESVJ: *> *> (15) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) @@ -354,6 +365,8 @@ SUBROUTINE DDRVBD( NSIZES, MM, NN, NTYPES, DOTYPE, ISEED, THRESH, $ A, LDA, U, LDU, VT, LDVT, ASAV, USAV, VTSAV, S, $ SSAV, E, WORK, LWORK, IWORK, NOUT, INFO ) +* + IMPLICIT NONE * * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -390,13 +403,19 @@ $ ITEMP, J, JSIZE, JTYPE, LSWORK, M, MINWRK, $ MMAX, MNMAX, MNMIN, MTYPES, N, NFAIL, $ NMAX, NS, NSI, NSV, NTEST - DOUBLE PRECISION ANORM, DIF, DIV, OVFL, RTUNFL, ULP, - $ ULPINV, UNFL, VL, VU + DOUBLE PRECISION ANORM, DIF, DIV, OVFL, RTUNFL, ULP, + $ ULPINV, UNFL, VL, VU +* .. +* .. Local Scalars for DGESVDQ .. + INTEGER LIWORK, LRWORK, NUMRANK +* .. +* .. Local Arrays for DGESVDQ .. + DOUBLE PRECISION RWORK( 2 ) * .. * .. Local Arrays .. CHARACTER CJOB( 4 ), CJOBR( 3 ), CJOBV( 2 ) INTEGER IOLDSD( 4 ), ISEED2( 4 ) - DOUBLE PRECISION RESULT( 40 ) + DOUBLE PRECISION RESULT( 39 ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLARND @@ -404,8 +423,8 @@ * .. * .. External Subroutines .. EXTERNAL ALASVM, DBDT01, DGEJSV, DGESDD, DGESVD, - $ DGESVDX, DGESVJ, DLABAD, DLACPY, DLASET, - $ DLATMS, DORT01, DORT03, XERBLA + $ DGESVDQ, DGESVDX, DGESVJ, DLABAD, DLACPY, + $ DLASET, DLATMS, DORT01, DORT03, XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, DBLE, INT, MAX, MIN @@ -781,8 +800,64 @@ RESULT( 14 ) = MAX( RESULT( 14 ), DIF ) 110 CONTINUE * -* Test DGESVJ: Factorize A -* Note: DGESVJ does not work for M < N +* Test DGESVDQ +* Note: DGESVDQ only works for M >= N +* + RESULT( 36 ) = ZERO + RESULT( 37 ) = ZERO + RESULT( 38 ) = ZERO + RESULT( 39 ) = ZERO +* + IF( M.GE.N ) THEN + IWTMP = 5*MNMIN*MNMIN + 9*MNMIN + MAX( M, N ) + LSWORK = IWTMP + ( IWS-1 )*( LWORK-IWTMP ) / 3 + LSWORK = MIN( LSWORK, LWORK ) + LSWORK = MAX( LSWORK, 1 ) + IF( IWS.EQ.4 ) + $ LSWORK = LWORK +* + CALL DLACPY( 'F', M, N, ASAV, LDA, A, LDA ) + SRNAMT = 'DGESVDQ' +* + LRWORK = 2 + LIWORK = MAX( N, 1 ) + CALL DGESVDQ( 'H', 'N', 'N', 'A', 'A', + $ M, N, A, LDA, SSAV, USAV, LDU, + $ VTSAV, LDVT, NUMRANK, IWORK, LIWORK, + $ WORK, LWORK, RWORK, LRWORK, IINFO ) +* + IF( IINFO.NE.0 ) THEN + WRITE( NOUT, FMT = 9995 )'DGESVDQ', IINFO, M, N, + $ JTYPE, LSWORK, IOLDSD + INFO = ABS( IINFO ) + RETURN + END IF +* +* Do tests 36--39 +* + CALL DBDT01( M, N, 0, ASAV, LDA, USAV, LDU, SSAV, E, + $ VTSAV, LDVT, WORK, RESULT( 36 ) ) + IF( M.NE.0 .AND. N.NE.0 ) THEN + CALL DORT01( 'Columns', M, M, USAV, LDU, WORK, + $ LWORK, RESULT( 37 ) ) + CALL DORT01( 'Rows', N, N, VTSAV, LDVT, WORK, + $ LWORK, RESULT( 38 ) ) + END IF + RESULT( 39 ) = ZERO + DO 199 I = 1, MNMIN - 1 + IF( SSAV( I ).LT.SSAV( I+1 ) ) + $ RESULT( 39 ) = ULPINV + IF( SSAV( I ).LT.ZERO ) + $ RESULT( 39 ) = ULPINV + 199 CONTINUE + IF( MNMIN.GE.1 ) THEN + IF( SSAV( MNMIN ).LT.ZERO ) + $ RESULT( 39 ) = ULPINV + END IF + END IF +* +* Test DGESVJ +* Note: DGESVJ only works for M >= N * RESULT( 15 ) = ZERO RESULT( 16 ) = ZERO @@ -802,8 +877,7 @@ CALL DGESVJ( 'G', 'U', 'V', M, N, USAV, LDA, SSAV, & 0, A, LDVT, WORK, LWORK, INFO ) * -* DGESVJ retuns V not VT, so we transpose to use the same -* test suite. +* DGESVJ returns V not VT * DO J=1,N DO I=1,N @@ -841,8 +915,8 @@ END IF END IF * -* Test DGEJSV: Factorize A -* Note: DGEJSV does not work for M < N +* Test DGEJSV +* Note: DGEJSV only works for M >= N * RESULT( 19 ) = ZERO RESULT( 20 ) = ZERO @@ -862,8 +936,7 @@ & M, N, VTSAV, LDA, SSAV, USAV, LDU, A, LDVT, & WORK, LWORK, IWORK, INFO ) * -* DGEJSV retuns V not VT, so we transpose to use the same -* test suite. 
+* DGEJSV returns V not VT
 *
            DO 140 J=1,N
               DO 130 I=1,N
@@ -872,7 +945,7 @@
  140       END DO
 *
            IF( IINFO.NE.0 ) THEN
-               WRITE( NOUT, FMT = 9995 )'GESVJ', IINFO, M, N,
+               WRITE( NOUT, FMT = 9995 )'GEJSV', IINFO, M, N,
     $         JTYPE, LSWORK, IOLDSD
               INFO = ABS( IINFO )
               RETURN
@@ -1086,7 +1159,7 @@
 *
 *        End of Loop -- Check for RESULT(j) > THRESH
 *
-         DO 210 J = 1, 35
+         DO 210 J = 1, 39
            IF( RESULT( J ).GE.THRESH ) THEN
               IF( NFAIL.EQ.0 ) THEN
                  WRITE( NOUT, FMT = 9999 )
@@ -1097,7 +1170,7 @@
               NFAIL = NFAIL + 1
            END IF
  210    CONTINUE
-         NTEST = NTEST + 35
+         NTEST = NTEST + 39
  220  CONTINUE
  230  CONTINUE
  240  CONTINUE
@@ -1158,6 +1231,12 @@
     $      ' DGESVDX(V,V,V) ',
     $      / '34 = | I - U**T U | / ( M ulp ) ',
     $      / '35 = | I - VT VT**T | / ( N ulp ) ',
+    $      ' DGESVDQ(H,N,N,A,A) ',
+    $      / '36 = | A - U diag(S) VT | / ( |A| max(M,N) ulp ) ',
+    $      / '37 = | I - U**T U | / ( M ulp ) ',
+    $      / '38 = | I - VT VT**T | / ( N ulp ) ',
+    $      / '39 = 0 if S contains min(M,N) nonnegative values in',
+    $      ' decreasing order, else 1/ulp',
     $      / / )
 9997 FORMAT( ' M=', I5, ', N=', I5, ', type ', I1, ', IWS=', I1,
     $      ', seed=', 4( I4, ',' ), ' test(', I2, ')=', G11.4 )
diff --git a/lapack-netlib/TESTING/EIG/derred.f b/lapack-netlib/TESTING/EIG/derred.f
index 5bde7f67d..94264e256 100644
--- a/lapack-netlib/TESTING/EIG/derred.f
+++ b/lapack-netlib/TESTING/EIG/derred.f
@@ -36,6 +36,8 @@
 *>       DGEJSV    compute SVD of an M-by-N matrix A where M >= N
 *>       DGESVDX   compute SVD of an M-by-N matrix A(by bisection
 *>                 and inverse iteration)
+*>       DGESVDQ   compute SVD of an M-by-N matrix A (with a
+*>                 QR-preconditioned algorithm)
 *> \endverbatim
 *
 *  Arguments:
@@ -100,7 +102,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, DGEES, DGEESX, DGEEV, DGEEVX, DGEJSV,
-     $                   DGESDD, DGESVD
+     $                   DGESDD, DGESVD, DGESVDX, DGESVDQ
 *     ..
 *     .. External Functions ..
LOGICAL DSLECT, LSAMEN @@ -486,6 +488,61 @@ ELSE WRITE( NOUT, FMT = 9998 ) END IF +* +* Test DGESVDQ +* + SRNAMT = 'DGESVDQ' + INFOT = 1 + CALL DGESVDQ( 'X', 'P', 'T', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGESVDQ( 'A', 'X', 'T', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGESVDQ( 'A', 'P', 'X', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGESVDQ( 'A', 'P', 'T', 'X', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGESVDQ( 'A', 'P', 'T', 'A', 'X', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', -1, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 0, 1, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 0, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ -1, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ 1, VT, -1, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 17 + CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ 1, VT, 1, NS, IW, -5, W, 1, W, 1, INFO ) + CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK ) + NT = 11 + IF( OK ) THEN + WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), + $ NT + ELSE + WRITE( NOUT, FMT = 9998 ) + END IF END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/dget39.f b/lapack-netlib/TESTING/EIG/dget39.f index 1d0ec1f45..17e66c8e6 100644 --- a/lapack-netlib/TESTING/EIG/dget39.f +++ b/lapack-netlib/TESTING/EIG/dget39.f @@ -194,7 +194,7 @@ VM5( 2 ) = EPS VM5( 3 ) = SQRT( SMLNUM ) * -* Initalization +* Initialization * KNT = 0 RMAX = ZERO diff --git a/lapack-netlib/TESTING/EIG/dsbt21.f b/lapack-netlib/TESTING/EIG/dsbt21.f index e7db231a9..54795623b 100644 --- a/lapack-netlib/TESTING/EIG/dsbt21.f +++ b/lapack-netlib/TESTING/EIG/dsbt21.f @@ -28,15 +28,16 @@ *> *> DSBT21 generally checks a decomposition of the form *> -*> A = U S U' +*> A = U S U**T *> -*> where ' means transpose, A is symmetric banded, U is +*> where **T means transpose, A is symmetric banded, U is *> orthogonal, and S is diagonal (if KS=0) or symmetric *> tridiagonal (if KS=1). *> *> Specifically: *> -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> \endverbatim * * Arguments: @@ -214,7 +215,7 @@ * ANORM = MAX( DLANSB( '1', CUPLO, N, IKA, A, LDA, WORK ), UNFL ) * -* Compute error matrix: Error = A - U S U' +* Compute error matrix: Error = A - U S U**T * * Copy A from SB to SP storage format. 
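The copy "from SB to SP storage format" in the DSBT21 comment above is a pure index mapping: each in-band entry of the banded matrix moves to its packed-triangle slot, and everything outside the band is zero-filled. A minimal sketch under the usual LAPACK SB/SP storage conventions (the helper name SB2SP is hypothetical and not part of the patch):

      SUBROUTINE SB2SP( UPLO, N, K, AB, LDAB, AP )
*     Hypothetical helper, not part of this patch: copies a symmetric
*     band matrix AB (SB format, bandwidth K) into packed storage AP
*     (SP format), zero-filling entries outside the band.
      CHARACTER          UPLO
      INTEGER            N, K, LDAB
      DOUBLE PRECISION   AB( LDAB, * ), AP( * )
      INTEGER            I, J, JJ
      LOGICAL            LSAME
      EXTERNAL           LSAME
      JJ = 0
      IF( LSAME( UPLO, 'U' ) ) THEN
*        Upper triangle, packed column by column: a(i,j) for i <= j
         DO 20 J = 1, N
            DO 10 I = 1, J
               JJ = JJ + 1
               IF( I.GE.J-K ) THEN
                  AP( JJ ) = AB( K+1+I-J, J )
               ELSE
                  AP( JJ ) = 0.0D0
               END IF
   10       CONTINUE
   20    CONTINUE
      ELSE
*        Lower triangle, packed column by column: a(i,j) for i >= j
         DO 40 J = 1, N
            DO 30 I = J, N
               JJ = JJ + 1
               IF( I.LE.J+K ) THEN
                  AP( JJ ) = AB( 1+I-J, J )
               ELSE
                  AP( JJ ) = 0.0D0
               END IF
   30       CONTINUE
   40    CONTINUE
      END IF
      RETURN
      END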
* @@ -265,7 +266,7 @@ * * Do Test 2 * -* Compute UU' - I +* Compute U U**T - I * CALL DGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK, $ N ) diff --git a/lapack-netlib/TESTING/EIG/dspt21.f b/lapack-netlib/TESTING/EIG/dspt21.f index 9f87959fe..4b1d360c5 100644 --- a/lapack-netlib/TESTING/EIG/dspt21.f +++ b/lapack-netlib/TESTING/EIG/dspt21.f @@ -28,9 +28,9 @@ *> *> DSPT21 generally checks a decomposition of the form *> -*> A = U S U' +*> A = U S U**T *> -*> where ' means transpose, A is symmetric (stored in packed format), U +*> where **T means transpose, A is symmetric (stored in packed format), U *> is orthogonal, and S is diagonal (if KBAND=0) or symmetric *> tridiagonal (if KBAND=1). If ITYPE=1, then U is represented as a *> dense matrix, otherwise the U is expressed as a product of @@ -41,15 +41,16 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> If ITYPE=2, then: *> -*> RESULT(1) = | A - V S V' | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**T | / ( |A| n ulp ) *> *> If ITYPE=3, then: *> -*> RESULT(1) = | I - VU' | / ( n ulp ) +*> RESULT(1) = | I - V U**T | / ( n ulp ) *> *> Packed storage means that, for example, if UPLO='U', then the columns *> of the upper triangle of A are stored one after another, so that @@ -70,7 +71,7 @@ *> *> If UPLO='U', then V = H(n-1)...H(1), where *> -*> H(j) = I - tau(j) v(j) v(j)' +*> H(j) = I - tau(j) v(j) v(j)**T *> *> and the first j-1 elements of v(j) are stored in V(1:j-1,j+1), *> (i.e., VP( j*(j+1)/2 + 1 : j*(j+1)/2 + j-1 ) ), @@ -78,7 +79,7 @@ *> *> If UPLO='L', then V = H(1)...H(n-1), where *> -*> H(j) = I - tau(j) v(j) v(j)' +*> H(j) = I - tau(j) v(j) v(j)**T *> *> and the first j elements of v(j) are 0, the (j+1)-st is 1, and the *> (j+2)-nd through n-th elements are stored in V(j+2:n,j) (i.e., @@ -93,14 +94,15 @@ *> ITYPE is INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense orthogonal matrix: -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> 2: U expressed as a product V of Housholder transformations: -*> RESULT(1) = | A - V S V' | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**T | / ( |A| n ulp ) *> *> 3: U expressed both as a dense orthogonal matrix and *> as a product of Housholder transformations: -*> RESULT(1) = | I - VU' | / ( n ulp ) +*> RESULT(1) = | I - V U**T | / ( n ulp ) *> \endverbatim *> *> \param[in] UPLO @@ -183,7 +185,7 @@ *> \verbatim *> TAU is DOUBLE PRECISION array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)' in the Householder transformation H(j) of +*> v(j) v(j)**T in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. 
*> \endverbatim @@ -303,7 +305,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: error = A - U S U' +* ITYPE=1: error = A - U S U**T * CALL DLASET( 'Full', N, N, ZERO, ZERO, WORK, N ) CALL DCOPY( LAP, AP, 1, WORK, 1 ) @@ -322,7 +324,7 @@ * ELSE IF( ITYPE.EQ.2 ) THEN * -* ITYPE=2: error = V S V' - A +* ITYPE=2: error = V S V**T - A * CALL DLASET( 'Full', N, N, ZERO, ZERO, WORK, N ) * @@ -389,7 +391,7 @@ * ELSE IF( ITYPE.EQ.3 ) THEN * -* ITYPE=3: error = U V' - I +* ITYPE=3: error = U V**T - I * IF( N.LT.2 ) $ RETURN @@ -420,7 +422,7 @@ * * Do Test 2 * -* Compute UU' - I +* Compute U U**T - I * IF( ITYPE.EQ.1 ) THEN CALL DGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK, diff --git a/lapack-netlib/TESTING/EIG/dsyt21.f b/lapack-netlib/TESTING/EIG/dsyt21.f index 0da3e5882..e00bd0db2 100644 --- a/lapack-netlib/TESTING/EIG/dsyt21.f +++ b/lapack-netlib/TESTING/EIG/dsyt21.f @@ -28,9 +28,9 @@ *> *> DSYT21 generally checks a decomposition of the form *> -*> A = U S U' +*> A = U S U**T *> -*> where ' means transpose, A is symmetric, U is orthogonal, and S is +*> where **T means transpose, A is symmetric, U is orthogonal, and S is *> diagonal (if KBAND=0) or symmetric tridiagonal (if KBAND=1). *> *> If ITYPE=1, then U is represented as a dense matrix; otherwise U is @@ -41,18 +41,19 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> If ITYPE=2, then: *> -*> RESULT(1) = | A - V S V' | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**T | / ( |A| n ulp ) *> *> If ITYPE=3, then: *> -*> RESULT(1) = | I - VU' | / ( n ulp ) +*> RESULT(1) = | I - V U**T | / ( n ulp ) *> *> For ITYPE > 1, the transformation U is expressed as a product -*> V = H(1)...H(n-2), where H(j) = I - tau(j) v(j) v(j)' and each +*> V = H(1)...H(n-2), where H(j) = I - tau(j) v(j) v(j)**T and each *> vector v(j) has its first j elements 0 and the remaining n-j elements *> stored in V(j+1:n,j). *> \endverbatim @@ -65,14 +66,15 @@ *> ITYPE is INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense orthogonal matrix: -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> 2: U expressed as a product V of Housholder transformations: -*> RESULT(1) = | A - V S V' | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**T | / ( |A| n ulp ) *> *> 3: U expressed both as a dense orthogonal matrix and *> as a product of Housholder transformations: -*> RESULT(1) = | I - VU' | / ( n ulp ) +*> RESULT(1) = | I - V U**T | / ( n ulp ) *> \endverbatim *> *> \param[in] UPLO @@ -170,7 +172,7 @@ *> \verbatim *> TAU is DOUBLE PRECISION array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)' in the Householder transformation H(j) of +*> v(j) v(j)**T in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. 
*> \endverbatim @@ -283,7 +285,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: error = A - U S U' +* ITYPE=1: error = A - U S U**T * CALL DLASET( 'Full', N, N, ZERO, ZERO, WORK, N ) CALL DLACPY( CUPLO, N, N, A, LDA, WORK, N ) @@ -302,7 +304,7 @@ * ELSE IF( ITYPE.EQ.2 ) THEN * -* ITYPE=2: error = V S V' - A +* ITYPE=2: error = V S V**T - A * CALL DLASET( 'Full', N, N, ZERO, ZERO, WORK, N ) * @@ -359,7 +361,7 @@ * ELSE IF( ITYPE.EQ.3 ) THEN * -* ITYPE=3: error = U V' - I +* ITYPE=3: error = U V**T - I * IF( N.LT.2 ) $ RETURN @@ -395,7 +397,7 @@ * * Do Test 2 * -* Compute UU' - I +* Compute U U**T - I * IF( ITYPE.EQ.1 ) THEN CALL DGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK, diff --git a/lapack-netlib/TESTING/EIG/dsyt22.f b/lapack-netlib/TESTING/EIG/dsyt22.f index 479b3ba5e..09e4aeb82 100644 --- a/lapack-netlib/TESTING/EIG/dsyt22.f +++ b/lapack-netlib/TESTING/EIG/dsyt22.f @@ -41,7 +41,8 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | U' A U - S | / ( |A| m ulp ) *andC> RESULT(2) = | I - U'U | / ( m ulp ) +*> RESULT(1) = | U**T A U - S | / ( |A| m ulp ) and +*> RESULT(2) = | I - U**T U | / ( m ulp ) *> \endverbatim * * Arguments: @@ -51,7 +52,8 @@ *> ITYPE INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense orthogonal matrix: -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> UPLO CHARACTER *> If UPLO='U', the upper triangle of A will be used and the @@ -122,7 +124,7 @@ *> *> TAU DOUBLE PRECISION array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)' in the Householder transformation H(j) of +*> v(j) v(j)**T in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. *> Not modified. @@ -207,7 +209,7 @@ * * Compute error matrix: * -* ITYPE=1: error = U' A U - S +* ITYPE=1: error = U**T A U - S * CALL DSYMM( 'L', UPLO, N, M, ONE, A, LDA, U, LDU, ZERO, WORK, N ) NN = N*N @@ -240,7 +242,7 @@ * * Do Test 2 * -* Compute U'U - I +* Compute U**T U - I * IF( ITYPE.EQ.1 ) $ CALL DORT01( 'Columns', N, M, U, LDU, WORK, 2*N*N, diff --git a/lapack-netlib/TESTING/EIG/sbdt05.f b/lapack-netlib/TESTING/EIG/sbdt05.f index 972ff952f..e3e79e91e 100644 --- a/lapack-netlib/TESTING/EIG/sbdt05.f +++ b/lapack-netlib/TESTING/EIG/sbdt05.f @@ -52,6 +52,7 @@ *> \verbatim *> A is REAL array, dimension (LDA,N) *> The m by n matrix A. +*> \endverbatim *> *> \param[in] LDA *> \verbatim diff --git a/lapack-netlib/TESTING/EIG/schkst.f b/lapack-netlib/TESTING/EIG/schkst.f index f4ae46832..a851bbbbf 100644 --- a/lapack-netlib/TESTING/EIG/schkst.f +++ b/lapack-netlib/TESTING/EIG/schkst.f @@ -166,7 +166,7 @@ *> SSTEMR('V', 'I') *> *> Tests 29 through 34 are disable at present because SSTEMR -*> does not handle partial specturm requests. +*> does not handle partial spectrum requests. *> *> (29) | S - Z D Z' | / ( |S| n ulp ) SSTEMR('V', 'I') *> diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f index 1c18e21bc..f386ab43c 100644 --- a/lapack-netlib/TESTING/EIG/schkst2stg.f +++ b/lapack-netlib/TESTING/EIG/schkst2stg.f @@ -187,7 +187,7 @@ *> SSTEMR('V', 'I') *> *> Tests 29 through 34 are disable at present because SSTEMR -*> does not handle partial specturm requests. +*> does not handle partial spectrum requests. 
*>
*> (29)  | S - Z D Z' | / ( |S| n ulp ) SSTEMR('V', 'I')
*>
diff --git a/lapack-netlib/TESTING/EIG/sdrgsx.f b/lapack-netlib/TESTING/EIG/sdrgsx.f
index bb5af0fd6..58e63e793 100644
--- a/lapack-netlib/TESTING/EIG/sdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/sdrgsx.f
@@ -770,7 +770,7 @@ c     MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 )
       CALL SLACPY( 'Full', MPLUSN, MPLUSN, AI, LDA, A, LDA )
       CALL SLACPY( 'Full', MPLUSN, MPLUSN, BI, LDA, B, LDA )
 *
-*     Compute the Schur factorization while swaping the
+*     Compute the Schur factorization while swapping the
 *     m-by-m (1,1)-blocks with n-by-n (2,2)-blocks.
 *
       CALL SGGESX( 'V', 'V', 'S', SLCTSX, 'B', MPLUSN, AI, LDA, BI, LDA,
diff --git a/lapack-netlib/TESTING/EIG/sdrvbd.f b/lapack-netlib/TESTING/EIG/sdrvbd.f
index b5d8a9b9a..101c8ba09 100644
--- a/lapack-netlib/TESTING/EIG/sdrvbd.f
+++ b/lapack-netlib/TESTING/EIG/sdrvbd.f
@@ -32,7 +32,7 @@
 *> \verbatim
 *>
 *> SDRVBD checks the singular value decomposition (SVD) drivers
-*> SGESVD, SGESDD, SGESVJ, and SGEJSV.
+*> SGESVD, SGESDD, SGESVDQ, SGESVJ, SGEJSV, and SGESVDX.
 *>
 *> Both SGESVD and SGESDD factor A = U diag(S) VT, where U and VT are
 *> orthogonal and diag(S) is diagonal with the entries of the array S
@@ -90,6 +90,17 @@
 *> (14) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
 *>      vector of singular values from the partial SVD
 *>
+*> Test for SGESVDQ:
+*>
+*> (36) | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*>
+*> (37) | I - U'U | / ( M ulp )
+*>
+*> (38) | I - VT VT' | / ( N ulp )
+*>
+*> (39) S contains MNMIN nonnegative values in decreasing order.
+*>      (Return 0 if true, 1/ULP if false.)
+*>
 *> Test for SGESVJ:
 *>
 *> (15) | A - U diag(S) VT | / ( |A| max(M,N) ulp )
@@ -359,6 +370,8 @@
 * -- LAPACK is a software package provided by Univ. of Tennessee, --
 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *    June 2016
+*
+      IMPLICIT NONE
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LDU, LDVT, LWORK, NOUT, NSIZES,
@@ -391,12 +404,18 @@
      $                   MMAX, MNMAX, MNMIN, MTYPES, N, NFAIL,
      $                   NMAX, NS, NSI, NSV, NTEST
       REAL               ANORM, DIF, DIV, OVFL, RTUNFL, ULP,
-     $                   ULPINV, UNFL, VL, VU
+     $                   ULPINV, UNFL, VL, VU
+*     ..
+*     .. Local Scalars for SGESVDQ ..
+      INTEGER            LIWORK, LRWORK, NUMRANK
+*     ..
+*     .. Local Arrays for SGESVDQ ..
+      REAL               RWORK( 2 )
 *     ..
 *     .. Local Arrays ..
       CHARACTER          CJOB( 4 ), CJOBR( 3 ), CJOBV( 2 )
       INTEGER            IOLDSD( 4 ), ISEED2( 4 )
-      REAL               RESULT( 40 )
+      REAL               RESULT( 39 )
 *     ..
 *     .. External Functions ..
       REAL               SLAMCH, SLARND
@@ -404,8 +423,8 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, SBDT01, SGEJSV, SGESDD, SGESVD,
-     $                   SGESVDX, SGESVJ, SLABAD, SLACPY, SLASET,
-     $                   SLATMS, SORT01, SORT03, XERBLA
+     $                   SGESVDQ, SGESVDX, SGESVJ, SLABAD, SLACPY,
+     $                   SLASET, SLATMS, SORT01, SORT03, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
INTRINSIC ABS, REAL, INT, MAX, MIN @@ -781,8 +800,64 @@ RESULT( 14 ) = MAX( RESULT( 14 ), DIF ) 110 CONTINUE * -* Test SGESVJ: Factorize A -* Note: SGESVJ does not work for M < N +* Test SGESVDQ +* Note: SGESVDQ only works for M >= N +* + RESULT( 36 ) = ZERO + RESULT( 37 ) = ZERO + RESULT( 38 ) = ZERO + RESULT( 39 ) = ZERO +* + IF( M.GE.N ) THEN + IWTMP = 5*MNMIN*MNMIN + 9*MNMIN + MAX( M, N ) + LSWORK = IWTMP + ( IWS-1 )*( LWORK-IWTMP ) / 3 + LSWORK = MIN( LSWORK, LWORK ) + LSWORK = MAX( LSWORK, 1 ) + IF( IWS.EQ.4 ) + $ LSWORK = LWORK +* + CALL SLACPY( 'F', M, N, ASAV, LDA, A, LDA ) + SRNAMT = 'SGESVDQ' +* + LRWORK = 2 + LIWORK = MAX( N, 1 ) + CALL SGESVDQ( 'H', 'N', 'N', 'A', 'A', + $ M, N, A, LDA, SSAV, USAV, LDU, + $ VTSAV, LDVT, NUMRANK, IWORK, LIWORK, + $ WORK, LWORK, RWORK, LRWORK, IINFO ) +* + IF( IINFO.NE.0 ) THEN + WRITE( NOUT, FMT = 9995 )'SGESVDQ', IINFO, M, N, + $ JTYPE, LSWORK, IOLDSD + INFO = ABS( IINFO ) + RETURN + END IF +* +* Do tests 36--39 +* + CALL SBDT01( M, N, 0, ASAV, LDA, USAV, LDU, SSAV, E, + $ VTSAV, LDVT, WORK, RESULT( 36 ) ) + IF( M.NE.0 .AND. N.NE.0 ) THEN + CALL SORT01( 'Columns', M, M, USAV, LDU, WORK, + $ LWORK, RESULT( 37 ) ) + CALL SORT01( 'Rows', N, N, VTSAV, LDVT, WORK, + $ LWORK, RESULT( 38 ) ) + END IF + RESULT( 39 ) = ZERO + DO 199 I = 1, MNMIN - 1 + IF( SSAV( I ).LT.SSAV( I+1 ) ) + $ RESULT( 39 ) = ULPINV + IF( SSAV( I ).LT.ZERO ) + $ RESULT( 39 ) = ULPINV + 199 CONTINUE + IF( MNMIN.GE.1 ) THEN + IF( SSAV( MNMIN ).LT.ZERO ) + $ RESULT( 39 ) = ULPINV + END IF + END IF +* +* Test SGESVJ +* Note: SGESVJ only works for M >= N * RESULT( 15 ) = ZERO RESULT( 16 ) = ZERO @@ -802,8 +877,7 @@ CALL SGESVJ( 'G', 'U', 'V', M, N, USAV, LDA, SSAV, & 0, A, LDVT, WORK, LWORK, INFO ) * -* SGESVJ retuns V not VT, so we transpose to use the same -* test suite. +* SGESVJ returns V not VT * DO J=1,N DO I=1,N @@ -841,8 +915,8 @@ END IF END IF * -* Test SGEJSV: Factorize A -* Note: SGEJSV does not work for M < N +* Test SGEJSV +* Note: SGEJSV only works for M >= N * RESULT( 19 ) = ZERO RESULT( 20 ) = ZERO @@ -862,8 +936,7 @@ & M, N, VTSAV, LDA, SSAV, USAV, LDU, A, LDVT, & WORK, LWORK, IWORK, INFO ) * -* SGEJSV retuns V not VT, so we transpose to use the same -* test suite. 
+* SGEJSV returns V not VT
 *
            DO 140 J=1,N
               DO 130 I=1,N
@@ -872,7 +945,7 @@
  140       END DO
 *
            IF( IINFO.NE.0 ) THEN
-               WRITE( NOUT, FMT = 9995 )'GESVJ', IINFO, M, N,
+               WRITE( NOUT, FMT = 9995 )'GEJSV', IINFO, M, N,
     $         JTYPE, LSWORK, IOLDSD
               INFO = ABS( IINFO )
               RETURN
@@ -1086,7 +1159,7 @@
 *
 *        End of Loop -- Check for RESULT(j) > THRESH
 *
-         DO 210 J = 1, 35
+         DO 210 J = 1, 39
            IF( RESULT( J ).GE.THRESH ) THEN
               IF( NFAIL.EQ.0 ) THEN
                  WRITE( NOUT, FMT = 9999 )
@@ -1097,7 +1170,7 @@
               NFAIL = NFAIL + 1
            END IF
  210    CONTINUE
-         NTEST = NTEST + 35
+         NTEST = NTEST + 39
  220  CONTINUE
  230  CONTINUE
  240  CONTINUE
@@ -1158,6 +1231,12 @@
     $      ' SGESVDX(V,V,V) ',
     $      / '34 = | I - U**T U | / ( M ulp ) ',
     $      / '35 = | I - VT VT**T | / ( N ulp ) ',
+    $      ' SGESVDQ(H,N,N,A,A) ',
+    $      / '36 = | A - U diag(S) VT | / ( |A| max(M,N) ulp ) ',
+    $      / '37 = | I - U**T U | / ( M ulp ) ',
+    $      / '38 = | I - VT VT**T | / ( N ulp ) ',
+    $      / '39 = 0 if S contains min(M,N) nonnegative values in',
+    $      ' decreasing order, else 1/ulp',
     $      / / )
 9997 FORMAT( ' M=', I5, ', N=', I5, ', type ', I1, ', IWS=', I1,
     $      ', seed=', 4( I4, ',' ), ' test(', I2, ')=', G11.4 )
diff --git a/lapack-netlib/TESTING/EIG/serred.f b/lapack-netlib/TESTING/EIG/serred.f
index f478fcdb1..7d3772e84 100644
--- a/lapack-netlib/TESTING/EIG/serred.f
+++ b/lapack-netlib/TESTING/EIG/serred.f
@@ -36,6 +36,8 @@
 *>       SGEJSV    compute SVD of an M-by-N matrix A where M >= N
 *>       SGESVDX   compute SVD of an M-by-N matrix A(by bisection
 *>                 and inverse iteration)
+*>       SGESVDQ   compute SVD of an M-by-N matrix A (with a
+*>                 QR-preconditioned algorithm)
 *> \endverbatim
 *
 *  Arguments:
@@ -100,7 +102,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, SGEES, SGEESX, SGEEV, SGEEVX, SGEJSV,
-     $                   SGESDD, SGESVD
+     $                   SGESDD, SGESVD, SGESVDX, SGESVDQ
 *     ..
 *     .. External Functions ..
LOGICAL SSLECT, LSAMEN @@ -486,6 +488,61 @@ ELSE WRITE( NOUT, FMT = 9998 ) END IF +* +* Test SGESVDQ +* + SRNAMT = 'SGESVDQ' + INFOT = 1 + CALL SGESVDQ( 'X', 'P', 'T', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGESVDQ( 'A', 'X', 'T', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGESVDQ( 'A', 'P', 'X', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGESVDQ( 'A', 'P', 'T', 'X', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGESVDQ( 'A', 'P', 'T', 'A', 'X', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', -1, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 0, 1, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 0, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ -1, VT, 0, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ 1, VT, -1, NS, IW, 1, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 17 + CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ 1, VT, 1, NS, IW, -5, W, 1, W, 1, INFO ) + CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK ) + NT = 11 + IF( OK ) THEN + WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), + $ NT + ELSE + WRITE( NOUT, FMT = 9998 ) + END IF END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/sget39.f b/lapack-netlib/TESTING/EIG/sget39.f index f02c6f856..f6c0f7e7c 100644 --- a/lapack-netlib/TESTING/EIG/sget39.f +++ b/lapack-netlib/TESTING/EIG/sget39.f @@ -194,7 +194,7 @@ VM5( 2 ) = EPS VM5( 3 ) = SQRT( SMLNUM ) * -* Initalization +* Initialization * KNT = 0 RMAX = ZERO diff --git a/lapack-netlib/TESTING/EIG/ssbt21.f b/lapack-netlib/TESTING/EIG/ssbt21.f index 50128ddbb..7ef5ad9b3 100644 --- a/lapack-netlib/TESTING/EIG/ssbt21.f +++ b/lapack-netlib/TESTING/EIG/ssbt21.f @@ -28,15 +28,16 @@ *> *> SSBT21 generally checks a decomposition of the form *> -*> A = U S U' +*> A = U S U**T *> -*> where ' means transpose, A is symmetric banded, U is +*> where **T means transpose, A is symmetric banded, U is *> orthogonal, and S is diagonal (if KS=0) or symmetric *> tridiagonal (if KS=1). *> *> Specifically: *> -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> \endverbatim * * Arguments: @@ -214,7 +215,7 @@ * ANORM = MAX( SLANSB( '1', CUPLO, N, IKA, A, LDA, WORK ), UNFL ) * -* Compute error matrix: Error = A - U S U' +* Compute error matrix: Error = A - U S U**T * * Copy A from SB to SP storage format. 
* @@ -265,7 +266,7 @@ * * Do Test 2 * -* Compute UU' - I +* Compute U U**T - I * CALL SGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK, $ N ) diff --git a/lapack-netlib/TESTING/EIG/sspt21.f b/lapack-netlib/TESTING/EIG/sspt21.f index 2384c87de..4ecb04c0e 100644 --- a/lapack-netlib/TESTING/EIG/sspt21.f +++ b/lapack-netlib/TESTING/EIG/sspt21.f @@ -28,9 +28,9 @@ *> *> SSPT21 generally checks a decomposition of the form *> -*> A = U S U' +*> A = U S U**T *> -*> where ' means transpose, A is symmetric (stored in packed format), U +*> where **T means transpose, A is symmetric (stored in packed format), U *> is orthogonal, and S is diagonal (if KBAND=0) or symmetric *> tridiagonal (if KBAND=1). If ITYPE=1, then U is represented as a *> dense matrix, otherwise the U is expressed as a product of @@ -41,15 +41,16 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> If ITYPE=2, then: *> -*> RESULT(1) = | A - V S V' | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**T | / ( |A| n ulp ) *> *> If ITYPE=3, then: *> -*> RESULT(1) = | I - VU' | / ( n ulp ) +*> RESULT(1) = | I - V U**T | / ( n ulp ) *> *> Packed storage means that, for example, if UPLO='U', then the columns *> of the upper triangle of A are stored one after another, so that @@ -70,7 +71,7 @@ *> *> If UPLO='U', then V = H(n-1)...H(1), where *> -*> H(j) = I - tau(j) v(j) v(j)' +*> H(j) = I - tau(j) v(j) v(j)**T *> *> and the first j-1 elements of v(j) are stored in V(1:j-1,j+1), *> (i.e., VP( j*(j+1)/2 + 1 : j*(j+1)/2 + j-1 ) ), @@ -78,7 +79,7 @@ *> *> If UPLO='L', then V = H(1)...H(n-1), where *> -*> H(j) = I - tau(j) v(j) v(j)' +*> H(j) = I - tau(j) v(j) v(j)**T *> *> and the first j elements of v(j) are 0, the (j+1)-st is 1, and the *> (j+2)-nd through n-th elements are stored in V(j+2:n,j) (i.e., @@ -93,14 +94,15 @@ *> ITYPE is INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense orthogonal matrix: -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> 2: U expressed as a product V of Housholder transformations: -*> RESULT(1) = | A - V S V' | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**T | / ( |A| n ulp ) *> *> 3: U expressed both as a dense orthogonal matrix and *> as a product of Housholder transformations: -*> RESULT(1) = | I - VU' | / ( n ulp ) +*> RESULT(1) = | I - V U**T | / ( n ulp ) *> \endverbatim *> *> \param[in] UPLO @@ -183,7 +185,7 @@ *> \verbatim *> TAU is REAL array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)' in the Householder transformation H(j) of +*> v(j) v(j)**T in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. 
*> \endverbatim @@ -303,7 +305,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: error = A - U S U' +* ITYPE=1: error = A - U S U**T * CALL SLASET( 'Full', N, N, ZERO, ZERO, WORK, N ) CALL SCOPY( LAP, AP, 1, WORK, 1 ) @@ -322,7 +324,7 @@ * ELSE IF( ITYPE.EQ.2 ) THEN * -* ITYPE=2: error = V S V' - A +* ITYPE=2: error = V S V**T - A * CALL SLASET( 'Full', N, N, ZERO, ZERO, WORK, N ) * @@ -389,7 +391,7 @@ * ELSE IF( ITYPE.EQ.3 ) THEN * -* ITYPE=3: error = U V' - I +* ITYPE=3: error = U V**T - I * IF( N.LT.2 ) $ RETURN @@ -420,7 +422,7 @@ * * Do Test 2 * -* Compute UU' - I +* Compute U U**T - I * IF( ITYPE.EQ.1 ) THEN CALL SGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK, diff --git a/lapack-netlib/TESTING/EIG/ssyt21.f b/lapack-netlib/TESTING/EIG/ssyt21.f index a7add3418..fc7ca6a2a 100644 --- a/lapack-netlib/TESTING/EIG/ssyt21.f +++ b/lapack-netlib/TESTING/EIG/ssyt21.f @@ -28,9 +28,9 @@ *> *> SSYT21 generally checks a decomposition of the form *> -*> A = U S U' +*> A = U S U**T *> -*> where ' means transpose, A is symmetric, U is orthogonal, and S is +*> where **T means transpose, A is symmetric, U is orthogonal, and S is *> diagonal (if KBAND=0) or symmetric tridiagonal (if KBAND=1). *> *> If ITYPE=1, then U is represented as a dense matrix; otherwise U is @@ -41,18 +41,19 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> If ITYPE=2, then: *> -*> RESULT(1) = | A - V S V' | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**T | / ( |A| n ulp ) *> *> If ITYPE=3, then: *> -*> RESULT(1) = | I - VU' | / ( n ulp ) +*> RESULT(1) = | I - V U**T | / ( n ulp ) *> *> For ITYPE > 1, the transformation U is expressed as a product -*> V = H(1)...H(n-2), where H(j) = I - tau(j) v(j) v(j)' and each +*> V = H(1)...H(n-2), where H(j) = I - tau(j) v(j) v(j)**T and each *> vector v(j) has its first j elements 0 and the remaining n-j elements *> stored in V(j+1:n,j). *> \endverbatim @@ -65,14 +66,15 @@ *> ITYPE is INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense orthogonal matrix: -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> 2: U expressed as a product V of Housholder transformations: -*> RESULT(1) = | A - V S V' | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**T | / ( |A| n ulp ) *> *> 3: U expressed both as a dense orthogonal matrix and *> as a product of Housholder transformations: -*> RESULT(1) = | I - VU' | / ( n ulp ) +*> RESULT(1) = | I - V U**T | / ( n ulp ) *> \endverbatim *> *> \param[in] UPLO @@ -170,7 +172,7 @@ *> \verbatim *> TAU is REAL array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)' in the Householder transformation H(j) of +*> v(j) v(j)**T in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. 
*> \endverbatim @@ -283,7 +285,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: error = A - U S U' +* ITYPE=1: error = A - U S U**T * CALL SLASET( 'Full', N, N, ZERO, ZERO, WORK, N ) CALL SLACPY( CUPLO, N, N, A, LDA, WORK, N ) @@ -302,7 +304,7 @@ * ELSE IF( ITYPE.EQ.2 ) THEN * -* ITYPE=2: error = V S V' - A +* ITYPE=2: error = V S V**T - A * CALL SLASET( 'Full', N, N, ZERO, ZERO, WORK, N ) * @@ -359,7 +361,7 @@ * ELSE IF( ITYPE.EQ.3 ) THEN * -* ITYPE=3: error = U V' - I +* ITYPE=3: error = U V**T - I * IF( N.LT.2 ) $ RETURN @@ -395,7 +397,7 @@ * * Do Test 2 * -* Compute UU' - I +* Compute U U**T - I * IF( ITYPE.EQ.1 ) THEN CALL SGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK, diff --git a/lapack-netlib/TESTING/EIG/ssyt22.f b/lapack-netlib/TESTING/EIG/ssyt22.f index 3b748ec7f..38fc3e555 100644 --- a/lapack-netlib/TESTING/EIG/ssyt22.f +++ b/lapack-netlib/TESTING/EIG/ssyt22.f @@ -41,7 +41,8 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | U' A U - S | / ( |A| m ulp ) *andC> RESULT(2) = | I - U'U | / ( m ulp ) +*> RESULT(1) = | U**T A U - S | / ( |A| m ulp ) and +*> RESULT(2) = | I - U**T U | / ( m ulp ) *> \endverbatim * * Arguments: @@ -51,7 +52,8 @@ *> ITYPE INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense orthogonal matrix: -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**T | / ( n ulp ) *> *> UPLO CHARACTER *> If UPLO='U', the upper triangle of A will be used and the @@ -122,7 +124,7 @@ *> *> TAU REAL array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)' in the Householder transformation H(j) of +*> v(j) v(j)**T in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. *> Not modified. @@ -207,7 +209,7 @@ * * Compute error matrix: * -* ITYPE=1: error = U' A U - S +* ITYPE=1: error = U**T A U - S * CALL SSYMM( 'L', UPLO, N, M, ONE, A, LDA, U, LDU, ZERO, WORK, N ) NN = N*N @@ -240,7 +242,7 @@ * * Do Test 2 * -* Compute U'U - I +* Compute U**T U - I * IF( ITYPE.EQ.1 ) $ CALL SORT01( 'Columns', N, M, U, LDU, WORK, 2*N*N, diff --git a/lapack-netlib/TESTING/EIG/zbdt05.f b/lapack-netlib/TESTING/EIG/zbdt05.f index 7a493292a..bbf0208b7 100644 --- a/lapack-netlib/TESTING/EIG/zbdt05.f +++ b/lapack-netlib/TESTING/EIG/zbdt05.f @@ -52,6 +52,7 @@ *> \verbatim *> A is COMPLEX*16 array, dimension (LDA,N) *> The m by n matrix A. +*> \endverbatim *> *> \param[in] LDA *> \verbatim diff --git a/lapack-netlib/TESTING/EIG/zchkst.f b/lapack-netlib/TESTING/EIG/zchkst.f index 4a8636ad9..cd45e98e1 100644 --- a/lapack-netlib/TESTING/EIG/zchkst.f +++ b/lapack-netlib/TESTING/EIG/zchkst.f @@ -167,7 +167,7 @@ *> ZSTEMR('V', 'I') *> *> Tests 29 through 34 are disable at present because ZSTEMR -*> does not handle partial specturm requests. +*> does not handle partial spectrum requests. *> *> (29) | S - Z D Z* | / ( |S| n ulp ) ZSTEMR('V', 'I') *> diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f index cd952bc37..167e5f359 100644 --- a/lapack-netlib/TESTING/EIG/zchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f @@ -188,7 +188,7 @@ *> ZSTEMR('V', 'I') *> *> Tests 29 through 34 are disable at present because ZSTEMR -*> does not handle partial specturm requests. +*> does not handle partial spectrum requests. 
*> *> (29) | S - Z D Z* | / ( |S| n ulp ) ZSTEMR('V', 'I') *> diff --git a/lapack-netlib/TESTING/EIG/zdrgev3.f b/lapack-netlib/TESTING/EIG/zdrgev3.f index 62ddf2b56..11e8562d7 100644 --- a/lapack-netlib/TESTING/EIG/zdrgev3.f +++ b/lapack-netlib/TESTING/EIG/zdrgev3.f @@ -389,7 +389,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date Febuary 2015 +*> \date February 2015 * *> \ingroup complex16_eig * diff --git a/lapack-netlib/TESTING/EIG/zdrgsx.f b/lapack-netlib/TESTING/EIG/zdrgsx.f index 51a7d773f..f5821e520 100644 --- a/lapack-netlib/TESTING/EIG/zdrgsx.f +++ b/lapack-netlib/TESTING/EIG/zdrgsx.f @@ -738,7 +738,7 @@ CALL ZLACPY( 'Full', MPLUSN, MPLUSN, AI, LDA, A, LDA ) CALL ZLACPY( 'Full', MPLUSN, MPLUSN, BI, LDA, B, LDA ) * -* Compute the Schur factorization while swaping the +* Compute the Schur factorization while swapping the * m-by-m (1,1)-blocks with n-by-n (2,2)-blocks. * CALL ZGGESX( 'V', 'V', 'S', ZLCTSX, 'B', MPLUSN, AI, LDA, BI, LDA, diff --git a/lapack-netlib/TESTING/EIG/zdrvbd.f b/lapack-netlib/TESTING/EIG/zdrvbd.f index 4bdbdfe2e..105e9dff7 100644 --- a/lapack-netlib/TESTING/EIG/zdrvbd.f +++ b/lapack-netlib/TESTING/EIG/zdrvbd.f @@ -33,8 +33,9 @@ *> *> \verbatim *> -*> ZDRVBD checks the singular value decomposition (SVD) driver ZGESVD -*> and ZGESDD. +*> ZDRVBD checks the singular value decomposition (SVD) driver ZGESVD, +*> ZGESDD, ZGESVJ, ZGEJSV, ZGESVDX, and ZGESVDQ. +*> *> ZGESVD and ZGESDD factors A = U diag(S) VT, where U and VT are *> unitary and diag(S) is diagonal with the entries of the array S on *> its diagonal. The entries of S are the singular values, nonnegative @@ -73,81 +74,92 @@ *> *> Test for ZGESDD: *> -*> (1) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> (8) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) *> -*> (2) | I - U'U | / ( M ulp ) +*> (9) | I - U'U | / ( M ulp ) *> -*> (3) | I - VT VT' | / ( N ulp ) +*> (10) | I - VT VT' | / ( N ulp ) *> -*> (4) S contains MNMIN nonnegative values in decreasing order. +*> (11) S contains MNMIN nonnegative values in decreasing order. *> (Return 0 if true, 1/ULP if false.) *> -*> (5) | U - Upartial | / ( M ulp ) where Upartial is a partially +*> (12) | U - Upartial | / ( M ulp ) where Upartial is a partially *> computed U. *> -*> (6) | VT - VTpartial | / ( N ulp ) where VTpartial is a partially +*> (13) | VT - VTpartial | / ( N ulp ) where VTpartial is a partially *> computed VT. *> -*> (7) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the +*> (14) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the *> vector of singular values from the partial SVD *> +*> Test for ZGESVDQ: +*> +*> (36) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> +*> (37) | I - U'U | / ( M ulp ) +*> +*> (38) | I - VT VT' | / ( N ulp ) +*> +*> (39) S contains MNMIN nonnegative values in decreasing order. +*> (Return 0 if true, 1/ULP if false.) +*> *> Test for ZGESVJ: *> -*> (1) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> (15) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) *> -*> (2) | I - U'U | / ( M ulp ) +*> (16) | I - U'U | / ( M ulp ) *> -*> (3) | I - VT VT' | / ( N ulp ) +*> (17) | I - VT VT' | / ( N ulp ) *> -*> (4) S contains MNMIN nonnegative values in decreasing order. +*> (18) S contains MNMIN nonnegative values in decreasing order. *> (Return 0 if true, 1/ULP if false.) 
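Each of the "S contains MNMIN nonnegative values in decreasing order" tests in this header reduces to the same scan of the computed singular values: any increase or negative entry flips the result from ZERO to 1/ulp. A minimal sketch of that scan (the helper name SVORD is hypothetical; the drivers inline the equivalent loop, as the DO 199 blocks elsewhere in this patch show):

      DOUBLE PRECISION FUNCTION SVORD( MNMIN, SSAV, ULPINV )
*     Hypothetical helper, not part of this patch: returns ZERO when
*     SSAV(1:MNMIN) is nonnegative and nonincreasing, and ULPINV
*     (i.e. 1/ulp) otherwise, matching the RESULT convention above.
      INTEGER            MNMIN
      DOUBLE PRECISION   SSAV( * ), ULPINV
      INTEGER            I
      SVORD = 0.0D0
      DO 10 I = 1, MNMIN - 1
         IF( SSAV( I ).LT.SSAV( I+1 ) .OR. SSAV( I ).LT.0.0D0 )
     $      SVORD = ULPINV
   10 CONTINUE
      IF( MNMIN.GE.1 ) THEN
         IF( SSAV( MNMIN ).LT.0.0D0 )
     $      SVORD = ULPINV
      END IF
      RETURN
      END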
*> *> Test for ZGEJSV: *> -*> (1) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> (19) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) *> -*> (2) | I - U'U | / ( M ulp ) +*> (20) | I - U'U | / ( M ulp ) *> -*> (3) | I - VT VT' | / ( N ulp ) +*> (21) | I - VT VT' | / ( N ulp ) *> -*> (4) S contains MNMIN nonnegative values in decreasing order. +*> (22) S contains MNMIN nonnegative values in decreasing order. *> (Return 0 if true, 1/ULP if false.) *> *> Test for ZGESVDX( 'V', 'V', 'A' )/ZGESVDX( 'N', 'N', 'A' ) *> -*> (1) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) +*> (23) | A - U diag(S) VT | / ( |A| max(M,N) ulp ) *> -*> (2) | I - U'U | / ( M ulp ) +*> (24) | I - U'U | / ( M ulp ) *> -*> (3) | I - VT VT' | / ( N ulp ) +*> (25) | I - VT VT' | / ( N ulp ) *> -*> (4) S contains MNMIN nonnegative values in decreasing order. +*> (26) S contains MNMIN nonnegative values in decreasing order. *> (Return 0 if true, 1/ULP if false.) *> -*> (5) | U - Upartial | / ( M ulp ) where Upartial is a partially +*> (27) | U - Upartial | / ( M ulp ) where Upartial is a partially *> computed U. *> -*> (6) | VT - VTpartial | / ( N ulp ) where VTpartial is a partially +*> (28) | VT - VTpartial | / ( N ulp ) where VTpartial is a partially *> computed VT. *> -*> (7) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the +*> (29) | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the *> vector of singular values from the partial SVD *> *> Test for ZGESVDX( 'V', 'V', 'I' ) *> -*> (8) | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp ) +*> (30) | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp ) *> -*> (9) | I - U'U | / ( M ulp ) +*> (31) | I - U'U | / ( M ulp ) *> -*> (10) | I - VT VT' | / ( N ulp ) +*> (32) | I - VT VT' | / ( N ulp ) *> *> Test for ZGESVDX( 'V', 'V', 'V' ) *> -*> (11) | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp ) +*> (33) | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp ) *> -*> (12) | I - U'U | / ( M ulp ) +*> (34) | I - U'U | / ( M ulp ) *> -*> (13) | I - VT VT' | / ( N ulp ) +*> (35) | I - VT VT' | / ( N ulp ) *> *> The "sizes" are specified by the arrays MM(1:NSIZES) and *> NN(1:NSIZES); the value of each element pair (MM(j),NN(j)) @@ -393,6 +405,8 @@ * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * June 2016 +* + IMPLICIT NONE * * .. Scalar Arguments .. INTEGER INFO, LDA, LDU, LDVT, LWORK, NOUNIT, NSIZES, @@ -411,7 +425,7 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ZERO, ONE, TWO, HALF + DOUBLE PRECISION ZERO, ONE, TWO, HALF PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, TWO = 2.0D0, $ HALF = 0.5D0 ) COMPLEX*16 CZERO, CONE @@ -431,10 +445,13 @@ DOUBLE PRECISION ANORM, DIF, DIV, OVFL, RTUNFL, ULP, ULPINV, $ UNFL, VL, VU * .. +* .. Local Scalars for ZGESVDQ .. + INTEGER LIWORK, NUMRANK +* .. * .. Local Arrays .. CHARACTER CJOB( 4 ), CJOBR( 3 ), CJOBV( 2 ) INTEGER IOLDSD( 4 ), ISEED2( 4 ) - DOUBLE PRECISION RESULT( 35 ) + DOUBLE PRECISION RESULT( 39 ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLARND @@ -442,8 +459,8 @@ * .. * .. External Subroutines .. EXTERNAL ALASVM, XERBLA, ZBDT01, ZBDT05, ZGESDD, - $ ZGESVD, ZGESVJ, ZGEJSV, ZGESVDX, ZLACPY, - $ ZLASET, ZLATMS, ZUNT01, ZUNT03 + $ ZGESVD, ZGESVDQ, ZGESVJ, ZGEJSV, ZGESVDX, + $ ZLACPY, ZLASET, ZLATMS, ZUNT01, ZUNT03 * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, DBLE, MAX, MIN @@ -836,10 +853,65 @@ 120 CONTINUE RESULT( 14 ) = MAX( RESULT( 14 ), DIF ) 130 CONTINUE - * -* Test ZGESVJ: Factorize A -* Note: ZGESVJ does not work for M < N +* Test ZGESVDQ +* Note: ZGESVDQ only works for M >= N +* + RESULT( 36 ) = ZERO + RESULT( 37 ) = ZERO + RESULT( 38 ) = ZERO + RESULT( 39 ) = ZERO +* + IF( M.GE.N ) THEN + IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) + LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 + LSWORK = MIN( LSWORK, LWORK ) + LSWORK = MAX( LSWORK, 1 ) + IF( IWSPC.EQ.4 ) + $ LSWORK = LWORK +* + CALL ZLACPY( 'F', M, N, ASAV, LDA, A, LDA ) + SRNAMT = 'ZGESVDQ' +* + LRWORK = MAX(2, M, 5*N) + LIWORK = MAX( N, 1 ) + CALL ZGESVDQ( 'H', 'N', 'N', 'A', 'A', + $ M, N, A, LDA, SSAV, USAV, LDU, + $ VTSAV, LDVT, NUMRANK, IWORK, LIWORK, + $ WORK, LWORK, RWORK, LRWORK, IINFO ) +* + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9995 )'ZGESVDQ', IINFO, M, N, + $ JTYPE, LSWORK, IOLDSD + INFO = ABS( IINFO ) + RETURN + END IF +* +* Do tests 36--39 +* + CALL ZBDT01( M, N, 0, ASAV, LDA, USAV, LDU, SSAV, E, + $ VTSAV, LDVT, WORK, RWORK, RESULT( 36 ) ) + IF( M.NE.0 .AND. N.NE.0 ) THEN + CALL ZUNT01( 'Columns', M, M, USAV, LDU, WORK, + $ LWORK, RWORK, RESULT( 37 ) ) + CALL ZUNT01( 'Rows', N, N, VTSAV, LDVT, WORK, + $ LWORK, RWORK, RESULT( 38 ) ) + END IF + RESULT( 39 ) = ZERO + DO 199 I = 1, MNMIN - 1 + IF( SSAV( I ).LT.SSAV( I+1 ) ) + $ RESULT( 39 ) = ULPINV + IF( SSAV( I ).LT.ZERO ) + $ RESULT( 39 ) = ULPINV + 199 CONTINUE + IF( MNMIN.GE.1 ) THEN + IF( SSAV( MNMIN ).LT.ZERO ) + $ RESULT( 39 ) = ULPINV + END IF + END IF +* +* Test ZGESVJ +* Note: ZGESVJ only works for M >= N * RESULT( 15 ) = ZERO RESULT( 16 ) = ZERO @@ -847,13 +919,13 @@ RESULT( 18 ) = ZERO * IF( M.GE.N ) THEN - IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) - LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 - LSWORK = MIN( LSWORK, LWORK ) - LSWORK = MAX( LSWORK, 1 ) - LRWORK = MAX(6,N) - IF( IWSPC.EQ.4 ) - $ LSWORK = LWORK + IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) + LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 + LSWORK = MIN( LSWORK, LWORK ) + LSWORK = MAX( LSWORK, 1 ) + LRWORK = MAX(6,N) + IF( IWSPC.EQ.4 ) + $ LSWORK = LWORK * CALL ZLACPY( 'F', M, N, ASAV, LDA, USAV, LDA ) SRNAMT = 'ZGESVJ' @@ -861,8 +933,7 @@ & 0, A, LDVT, WORK, LWORK, RWORK, & LRWORK, IINFO ) * -* ZGESVJ retuns V not VT, so we transpose to use the same -* test suite. +* ZGESVJ returns V not VH * DO J=1,N DO I=1,N @@ -900,21 +971,21 @@ END IF END IF * -* Test ZGEJSV: Factorize A -* Note: ZGEJSV does not work for M < N +* Test ZGEJSV +* Note: ZGEJSV only works for M >= N * RESULT( 19 ) = ZERO RESULT( 20 ) = ZERO RESULT( 21 ) = ZERO RESULT( 22 ) = ZERO IF( M.GE.N ) THEN - IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) - LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 - LSWORK = MIN( LSWORK, LWORK ) - LSWORK = MAX( LSWORK, 1 ) - IF( IWSPC.EQ.4 ) - $ LSWORK = LWORK - LRWORK = MAX( 7, N + 2*M) + IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N ) + LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3 + LSWORK = MIN( LSWORK, LWORK ) + LSWORK = MAX( LSWORK, 1 ) + IF( IWSPC.EQ.4 ) + $ LSWORK = LWORK + LRWORK = MAX( 7, N + 2*M) * CALL ZLACPY( 'F', M, N, ASAV, LDA, VTSAV, LDA ) SRNAMT = 'ZGEJSV' @@ -923,8 +994,7 @@ & WORK, LWORK, RWORK, & LRWORK, IWORK, IINFO ) * -* ZGEJSV retuns V not VT, so we transpose to use the same -* test suite. 
+* ZGEJSV returns V not VH * DO 133 J=1,N DO 132 I=1,N @@ -933,7 +1003,7 @@ 133 END DO * IF( IINFO.NE.0 ) THEN - WRITE( NOUT, FMT = 9995 )'GESVJ', IINFO, M, N, + WRITE( NOUT, FMT = 9995 )'GEJSV', IINFO, M, N, $ JTYPE, LSWORK, IOLDSD INFO = ABS( IINFO ) RETURN @@ -1160,7 +1230,7 @@ * NTEST = 0 NFAIL = 0 - DO 190 J = 1, 35 + DO 190 J = 1, 39 IF( RESULT( J ).GE.ZERO ) $ NTEST = NTEST + 1 IF( RESULT( J ).GE.THRESH ) @@ -1175,7 +1245,7 @@ NTESTF = 2 END IF * - DO 200 J = 1, 35 + DO 200 J = 1, 39 IF( RESULT( J ).GE.THRESH ) THEN WRITE( NOUNIT, FMT = 9997 )M, N, JTYPE, IWSPC, $ IOLDSD, J, RESULT( J ) @@ -1251,6 +1321,12 @@ $ / '33 = | U**T A VT**T - diag(S) | / ( |A| max(M,N) ulp )', $ / '34 = | I - U**T U | / ( M ulp ) ', $ / '35 = | I - VT VT**T | / ( N ulp ) ', + $ ' ZGESVDQ(H,N,N,A,A', + $ / '36 = | A - U diag(S) VT | / ( |A| max(M,N) ulp ) ', + $ / '37 = | I - U**T U | / ( M ulp ) ', + $ / '38 = | I - VT VT**T | / ( N ulp ) ', + $ / '39 = 0 if S contains min(M,N) nonnegative values in', + $ ' decreasing order, else 1/ulp', $ / / ) 9997 FORMAT( ' M=', I5, ', N=', I5, ', type ', I1, ', IWS=', I1, $ ', seed=', 4( I4, ',' ), ' test(', I2, ')=', G11.4 ) diff --git a/lapack-netlib/TESTING/EIG/zerred.f b/lapack-netlib/TESTING/EIG/zerred.f index 00bfbf261..013dc16c5 100644 --- a/lapack-netlib/TESTING/EIG/zerred.f +++ b/lapack-netlib/TESTING/EIG/zerred.f @@ -36,6 +36,8 @@ *> ZGEJSV compute SVD of an M-by-N matrix A where M >= N *> ZGESVDX compute SVD of an M-by-N matrix A(by bisection *> and inverse iteration) +*> ZGESVDQ compute SVD of an M-by-N matrix A (with a +*> QR-preconditioned method) *> \endverbatim * * Arguments: * ========== @@ -101,7 +103,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, ZGEES, ZGEESX, ZGEEV, ZGEEVX, ZGESVJ, - $ ZGESDD, ZGESVD + $ ZGESDD, ZGESVD, ZGESVDX, ZGESVDQ * .. * .. External Functions .. 
LOGICAL LSAMEN, ZSLECT @@ -495,6 +497,61 @@ ELSE WRITE( NOUT, FMT = 9998 ) END IF +* +* Test ZGESVDQ +* + SRNAMT = 'ZGESVDQ' + INFOT = 1 + CALL ZGESVDQ( 'X', 'P', 'T', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGESVDQ( 'A', 'X', 'T', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGESVDQ( 'A', 'P', 'X', 'A', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGESVDQ( 'A', 'P', 'T', 'X', 'A', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'X', 0, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', -1, 0, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 0, 1, A, 1, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 0, S, U, + $ 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ -1, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ 1, VT, -1, NS, IW, 1, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + INFOT = 17 + CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U, + $ 1, VT, 1, NS, IW, -5, W, 1, RW, 1, INFO ) + CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK ) + NT = 11 + IF( OK ) THEN + WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), + $ NT + ELSE + WRITE( NOUT, FMT = 9998 ) + END IF END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/zget51.f b/lapack-netlib/TESTING/EIG/zget51.f index 96b1dfae4..e019127a3 100644 --- a/lapack-netlib/TESTING/EIG/zget51.f +++ b/lapack-netlib/TESTING/EIG/zget51.f @@ -29,12 +29,13 @@ *> *> ZGET51 generally checks a decomposition of the form *> -*> A = U B VC> -*> where * means conjugate transpose and U and V are unitary. +*> A = U B V**H +*> +*> where **H means conjugate transpose and U and V are unitary. *> *> Specifically, if ITYPE=1 *> -*> RESULT = | A - U B V* | / ( |A| n ulp ) +*> RESULT = | A - U B V**H | / ( |A| n ulp ) *> *> If ITYPE=2, then: *> @@ -42,7 +43,7 @@ *> *> If ITYPE=3, then: *> -*> RESULT = | I - UU* | / ( n ulp ) +*> RESULT = | I - U U**H | / ( n ulp ) *> \endverbatim * * Arguments: @@ -52,9 +53,9 @@ *> \verbatim *> ITYPE is INTEGER *> Specifies the type of tests to be performed. 
-*> =1: RESULT = | A - U B V* | / ( |A| n ulp ) +*> =1: RESULT = | A - U B V**H | / ( |A| n ulp ) *> =2: RESULT = | A - B | / ( |A| n ulp ) -*> =3: RESULT = | I - UU* | / ( n ulp ) +*> =3: RESULT = | I - U U**H | / ( n ulp ) *> \endverbatim *> *> \param[in] N @@ -218,7 +219,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: Compute W = A - UBV' +* ITYPE=1: Compute W = A - U B V**H * CALL ZLACPY( ' ', N, N, A, LDA, WORK, N ) CALL ZGEMM( 'N', 'N', N, N, N, CONE, U, LDU, B, LDB, CZERO, @@ -259,7 +260,7 @@ * * Tests not scaled by norm(A) * -* ITYPE=3: Compute UU' - I +* ITYPE=3: Compute U U**H - I * CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, $ WORK, N ) diff --git a/lapack-netlib/TESTING/EIG/zhbt21.f b/lapack-netlib/TESTING/EIG/zhbt21.f index 4cd8ed9f7..68125854c 100644 --- a/lapack-netlib/TESTING/EIG/zhbt21.f +++ b/lapack-netlib/TESTING/EIG/zhbt21.f @@ -28,14 +28,16 @@ *> *> ZHBT21 generally checks a decomposition of the form *> -*> A = U S UC> -*> where * means conjugate transpose, A is hermitian banded, U is +*> A = U S U**H +*> +*> where **H means conjugate transpose, A is hermitian banded, U is *> unitary, and S is diagonal (if KS=0) or symmetric *> tridiagonal (if KS=1). *> *> Specifically: *> -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> \endverbatim * * Arguments: @@ -220,7 +222,7 @@ * ANORM = MAX( ZLANHB( '1', CUPLO, N, IKA, A, LDA, RWORK ), UNFL ) * -* Compute error matrix: Error = A - U S U* +* Compute error matrix: Error = A - U S U**H * * Copy A from SB to SP storage format. * @@ -271,7 +273,7 @@ * * Do Test 2 * -* Compute UU* - I +* Compute U U**H - I * CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, WORK, $ N ) diff --git a/lapack-netlib/TESTING/EIG/zhet21.f b/lapack-netlib/TESTING/EIG/zhet21.f index f6cb2d70a..cb854a850 100644 --- a/lapack-netlib/TESTING/EIG/zhet21.f +++ b/lapack-netlib/TESTING/EIG/zhet21.f @@ -29,8 +29,9 @@ *> *> ZHET21 generally checks a decomposition of the form *> -*> A = U S UC> -*> where * means conjugate transpose, A is hermitian, U is unitary, and +*> A = U S U**H +*> +*> where **H means conjugate transpose, A is hermitian, U is unitary, and *> S is diagonal (if KBAND=0) or (real) symmetric tridiagonal (if *> KBAND=1). *> @@ -42,18 +43,19 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> If ITYPE=2, then: *> -*> RESULT(1) = | A - V S V* | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**H | / ( |A| n ulp ) *> *> If ITYPE=3, then: *> -*> RESULT(1) = | I - UV* | / ( n ulp ) +*> RESULT(1) = | I - U V**H | / ( n ulp ) *> *> For ITYPE > 1, the transformation U is expressed as a product -*> V = H(1)...H(n-2), where H(j) = I - tau(j) v(j) v(j)C> and each +*> V = H(1)...H(n-2), where H(j) = I - tau(j) v(j) v(j)**H and each *> vector v(j) has its first j elements 0 and the remaining n-j elements *> stored in V(j+1:n,j). *> \endverbatim @@ -66,14 +68,15 @@ *> ITYPE is INTEGER *> Specifies the type of tests to be performed. 
*> 1: U expressed as a dense unitary matrix: -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> 2: U expressed as a product V of Housholder transformations: -*> RESULT(1) = | A - V S V* | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**H | / ( |A| n ulp ) *> *> 3: U expressed both as a dense unitary matrix and *> as a product of Housholder transformations: -*> RESULT(1) = | I - UV* | / ( n ulp ) +*> RESULT(1) = | I - U V**H | / ( n ulp ) *> \endverbatim *> *> \param[in] UPLO @@ -171,7 +174,7 @@ *> \verbatim *> TAU is COMPLEX*16 array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)* in the Householder transformation H(j) of +*> v(j) v(j)**H in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. *> \endverbatim @@ -294,7 +297,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: error = A - U S U* +* ITYPE=1: error = A - U S U**H * CALL ZLASET( 'Full', N, N, CZERO, CZERO, WORK, N ) CALL ZLACPY( CUPLO, N, N, A, LDA, WORK, N ) @@ -304,7 +307,6 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN -CMK DO 20 J = 1, N - 1 DO 20 J = 2, N - 1 CALL ZHER2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK, N ) @@ -314,7 +316,7 @@ CMK DO 20 J = 1, N - 1 * ELSE IF( ITYPE.EQ.2 ) THEN * -* ITYPE=2: error = V S V* - A +* ITYPE=2: error = V S V**H - A * CALL ZLASET( 'Full', N, N, CZERO, CZERO, WORK, N ) * @@ -371,7 +373,7 @@ CMK DO 20 J = 1, N - 1 * ELSE IF( ITYPE.EQ.3 ) THEN * -* ITYPE=3: error = U V* - I +* ITYPE=3: error = U V**H - I * IF( N.LT.2 ) $ RETURN @@ -407,7 +409,7 @@ CMK DO 20 J = 1, N - 1 * * Do Test 2 * -* Compute UU* - I +* Compute U U**H - I * IF( ITYPE.EQ.1 ) THEN CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, diff --git a/lapack-netlib/TESTING/EIG/zhet22.f b/lapack-netlib/TESTING/EIG/zhet22.f index 7237f43f7..8ef73aef3 100644 --- a/lapack-netlib/TESTING/EIG/zhet22.f +++ b/lapack-netlib/TESTING/EIG/zhet22.f @@ -42,7 +42,8 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | U' A U - S | / ( |A| m ulp ) *andC> RESULT(2) = | I - U'U | / ( m ulp ) +*> RESULT(1) = | U**H A U - S | / ( |A| m ulp ) and +*> RESULT(2) = | I - U**H U | / ( m ulp ) *> \endverbatim * * Arguments: @@ -52,7 +53,8 @@ *> ITYPE INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense orthogonal matrix: -*> RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU' | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) *and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> UPLO CHARACTER *> If UPLO='U', the upper triangle of A will be used and the @@ -122,7 +124,7 @@ *> *> TAU COMPLEX*16 array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)' in the Householder transformation H(j) of +*> v(j) v(j)**H in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. *> Not modified. 
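[Review note] The corrected comments above all describe the same pair of checks: a backward-error ratio for the decomposition and an orthonormality ratio RESULT(2) = | I - U U**H | / ( n ulp ). For reference, a minimal sketch of how that second ratio is typically formed in these routines (the names U, LDU, WORK, RWORK, RESID and ULP follow zhet21.f conventions; this is an illustration, not text from the patch):

      CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO,
     $            WORK, N )
      DO 10 J = 1, N
         WORK( ( N+1 )*( J-1 ) + 1 ) = WORK( ( N+1 )*( J-1 ) + 1 ) -
     $                                 CONE
   10 CONTINUE
      RESID = ZLANGE( '1', N, N, WORK, N, RWORK )
      RESULT( 2 ) = MIN( DBLE( N ), RESID ) / ( DBLE( N )*ULP )

The ZGEMM call forms U * U**H in WORK, the loop subtracts the identity on the diagonal, and the 1-norm of the remainder is scaled by n ulp.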
@@ -215,7 +217,7 @@ * * Compute error matrix: * -* ITYPE=1: error = U' A U - S +* ITYPE=1: error = U**H A U - S * CALL ZHEMM( 'L', UPLO, N, M, CONE, A, LDA, U, LDU, CZERO, WORK, $ N ) @@ -249,7 +251,7 @@ * * Do Test 2 * -* Compute U'U - I +* Compute U**H U - I * IF( ITYPE.EQ.1 ) $ CALL ZUNT01( 'Columns', N, M, U, LDU, WORK, 2*N*N, RWORK, diff --git a/lapack-netlib/TESTING/EIG/zhpt21.f b/lapack-netlib/TESTING/EIG/zhpt21.f index ef9e4418d..825d387c7 100644 --- a/lapack-netlib/TESTING/EIG/zhpt21.f +++ b/lapack-netlib/TESTING/EIG/zhpt21.f @@ -29,8 +29,9 @@ *> *> ZHPT21 generally checks a decomposition of the form *> -*> A = U S UC> -*> where * means conjugate transpose, A is hermitian, U is +*> A = U S U**H +*> +*> where **H means conjugate transpose, A is hermitian, U is *> unitary, and S is diagonal (if KBAND=0) or (real) symmetric *> tridiagonal (if KBAND=1). If ITYPE=1, then U is represented as *> a dense matrix, otherwise the U is expressed as a product of @@ -41,15 +42,16 @@ *> *> Specifically, if ITYPE=1, then: *> -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> If ITYPE=2, then: *> -*> RESULT(1) = | A - V S V* | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**H | / ( |A| n ulp ) *> *> If ITYPE=3, then: *> -*> RESULT(1) = | I - UV* | / ( n ulp ) +*> RESULT(1) = | I - U V**H | / ( n ulp ) *> *> Packed storage means that, for example, if UPLO='U', then the columns *> of the upper triangle of A are stored one after another, so that @@ -70,14 +72,16 @@ *> *> If UPLO='U', then V = H(n-1)...H(1), where *> -*> H(j) = I - tau(j) v(j) v(j)C> +*> H(j) = I - tau(j) v(j) v(j)**H +*> *> and the first j-1 elements of v(j) are stored in V(1:j-1,j+1), *> (i.e., VP( j*(j+1)/2 + 1 : j*(j+1)/2 + j-1 ) ), *> the j-th element is 1, and the last n-j elements are 0. *> *> If UPLO='L', then V = H(1)...H(n-1), where *> -*> H(j) = I - tau(j) v(j) v(j)C> +*> H(j) = I - tau(j) v(j) v(j)**H +*> *> and the first j elements of v(j) are 0, the (j+1)-st is 1, and the *> (j+2)-nd through n-th elements are stored in V(j+2:n,j) (i.e., *> in VP( (2*n-j)*(j-1)/2 + j+2 : (2*n-j)*(j-1)/2 + n ) .) @@ -91,14 +95,15 @@ *> ITYPE is INTEGER *> Specifies the type of tests to be performed. *> 1: U expressed as a dense unitary matrix: -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> *> 2: U expressed as a product V of Housholder transformations: -*> RESULT(1) = | A - V S V* | / ( |A| n ulp ) +*> RESULT(1) = | A - V S V**H | / ( |A| n ulp ) *> *> 3: U expressed both as a dense unitary matrix and *> as a product of Housholder transformations: -*> RESULT(1) = | I - UV* | / ( n ulp ) +*> RESULT(1) = | I - U V**H | / ( n ulp ) *> \endverbatim *> *> \param[in] UPLO @@ -181,7 +186,7 @@ *> \verbatim *> TAU is COMPLEX*16 array, dimension (N) *> If ITYPE >= 2, then TAU(j) is the scalar factor of -*> v(j) v(j)* in the Householder transformation H(j) of +*> v(j) v(j)**H in the Householder transformation H(j) of *> the product U = H(1)...H(n-2) *> If ITYPE < 2, then TAU is not referenced. *> \endverbatim @@ -313,7 +318,7 @@ * IF( ITYPE.EQ.1 ) THEN * -* ITYPE=1: error = A - U S U* +* ITYPE=1: error = A - U S U**H * CALL ZLASET( 'Full', N, N, CZERO, CZERO, WORK, N ) CALL ZCOPY( LAP, AP, 1, WORK, 1 ) @@ -323,7 +328,6 @@ 10 CONTINUE * IF( N.GT.1 .AND. 
KBAND.EQ.1 ) THEN -CMK DO 20 J = 1, N - 1 DO 20 J = 2, N - 1 CALL ZHPR2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK ) @@ -333,7 +337,7 @@ CMK DO 20 J = 1, N - 1 * ELSE IF( ITYPE.EQ.2 ) THEN * -* ITYPE=2: error = V S V* - A +* ITYPE=2: error = V S V**H - A * CALL ZLASET( 'Full', N, N, CZERO, CZERO, WORK, N ) * @@ -401,7 +405,7 @@ CMK DO 20 J = 1, N - 1 * ELSE IF( ITYPE.EQ.3 ) THEN * -* ITYPE=3: error = U V* - I +* ITYPE=3: error = U V**H - I * IF( N.LT.2 ) $ RETURN @@ -432,7 +436,7 @@ CMK DO 20 J = 1, N - 1 * * Do Test 2 * -* Compute UU* - I +* Compute U U**H - I * IF( ITYPE.EQ.1 ) THEN CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, diff --git a/lapack-netlib/TESTING/EIG/zstt21.f b/lapack-netlib/TESTING/EIG/zstt21.f index ad1fe5529..f2e32a12e 100644 --- a/lapack-netlib/TESTING/EIG/zstt21.f +++ b/lapack-netlib/TESTING/EIG/zstt21.f @@ -28,14 +28,15 @@ *> *> ZSTT21 checks a decomposition of the form *> -*> A = U S UC> -*> where * means conjugate transpose, A is real symmetric tridiagonal, +*> A = U S U**H +*> +*> where **H means conjugate transpose, A is real symmetric tridiagonal, *> U is unitary, and S is real and diagonal (if KBAND=0) or symmetric *> tridiagonal (if KBAND=1). Two tests are performed: *> -*> RESULT(1) = | A - U S U* | / ( |A| n ulp ) +*> RESULT(1) = | A - U S U**H | / ( |A| n ulp ) *> -*> RESULT(2) = | I - UU* | / ( n ulp ) +*> RESULT(2) = | I - U U**H | / ( n ulp ) *> \endverbatim * * Arguments: @@ -228,7 +229,7 @@ * * Do Test 2 * -* Compute UU* - I +* Compute U U**H - I * CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, WORK, $ N ) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 50ba8fc28..c941d3577 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -39,7 +39,8 @@ set(SLINTST schkaa.f strt02.f strt03.f strt05.f strt06.f sgennd.f sqrt04.f sqrt05.f schkqrt.f serrqrt.f schkqrtp.f serrqrtp.f schklqt.f schklqtp.f schktsqr.f - serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f) + serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f + schkorhr_col.f serrorhr_col.f sorhr_col01.f) if(USE_XBLAS) list(APPEND SLINTST sdrvgbx.f sdrvgex.f sdrvsyx.f sdrvpox.f @@ -94,7 +95,8 @@ set(CLINTST cchkaa.f sget06.f cgennd.f cqrt04.f cqrt05.f cchkqrt.f cerrqrt.f cchkqrtp.f cerrqrtp.f cchklqt.f cchklqtp.f cchktsqr.f - cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f) + cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f + cchkunhr_col.f cerrunhr_col.f cunhr_col01.f) if(USE_XBLAS) list(APPEND CLINTST cdrvgbx.f cdrvgex.f cdrvhex.f cdrvsyx.f cdrvpox.f @@ -139,7 +141,8 @@ set(DLINTST dchkaa.f dgennd.f dqrt04.f dqrt05.f dchkqrt.f derrqrt.f dchkqrtp.f derrqrtp.f dchklq.f dchklqt.f dchklqtp.f dchktsqr.f - derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f) + derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f + dchkorhr_col.f derrorhr_col.f dorhr_col01.f) if(USE_XBLAS) list(APPEND DLINTST ddrvgbx.f ddrvgex.f ddrvsyx.f ddrvpox.f @@ -194,7 +197,8 @@ set(ZLINTST zchkaa.f dget06.f zgennd.f zqrt04.f zqrt05.f zchkqrt.f zerrqrt.f zchkqrtp.f zerrqrtp.f zchklqt.f zchklqtp.f zchktsqr.f - zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f) + zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f + zchkunhr_col.f zerrunhr_col.f zunhr_col01.f) if(USE_XBLAS) list(APPEND ZLINTST zdrvgbx.f zdrvgex.f zdrvhex.f zdrvsyx.f zdrvpox.f diff --git a/lapack-netlib/TESTING/LIN/Makefile 
b/lapack-netlib/TESTING/LIN/Makefile index 1a332f70b..6e790aa93 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -1,5 +1,3 @@ -include ../../make.inc - ####################################################################### # This makefile creates the test programs for the linear equation # routines in LAPACK. The test files are grouped as follows: @@ -33,10 +31,8 @@ include ../../make.inc # ####################################################################### -ifneq ($(strip $(VARLIB)),) - LAPACKLIB := $(VARLIB) ../../$(LAPACKLIB) -endif - +TOPSRCDIR = ../.. +include $(TOPSRCDIR)/make.inc ALINTST = \ aladhd.o alaerh.o alaesm.o alahd.o alareq.o \ @@ -77,7 +73,8 @@ SLINTST = schkaa.o \ strt02.o strt03.o strt05.o strt06.o \ sgennd.o sqrt04.o sqrt05.o schkqrt.o serrqrt.o schkqrtp.o serrqrtp.o \ schklqt.o schklqtp.o schktsqr.o \ - serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o + serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o \ + schkorhr_col.o serrorhr_col.o sorhr_col01.o ifdef USEXBLAS SLINTST += sdrvgbx.o sdrvgex.o sdrvsyx.o sdrvpox.o \ @@ -125,7 +122,8 @@ CLINTST = cchkaa.o \ sget06.o cgennd.o \ cqrt04.o cqrt05.o cchkqrt.o cerrqrt.o cchkqrtp.o cerrqrtp.o \ cchklqt.o cchklqtp.o cchktsqr.o \ - cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o + cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o \ + cchkunhr_col.o cerrunhr_col.o cunhr_col01.o ifdef USEXBLAS CLINTST += cdrvgbx.o cdrvgex.o cdrvhex.o cdrvsyx.o cdrvpox.o \ @@ -168,7 +166,8 @@ DLINTST = dchkaa.o \ dgennd.o \ dqrt04.o dqrt05.o dchkqrt.o derrqrt.o dchkqrtp.o derrqrtp.o \ dchklq.o dchklqt.o dchklqtp.o dchktsqr.o \ - derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o + derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o \ + dchkorhr_col.o derrorhr_col.o dorhr_col01.o ifdef USEXBLAS DLINTST += ddrvgbx.o ddrvgex.o ddrvsyx.o ddrvpox.o \ @@ -215,7 +214,8 @@ ZLINTST = zchkaa.o \ dget06.o zgennd.o \ zqrt04.o zqrt05.o zchkqrt.o zerrqrt.o zchkqrtp.o zerrqrtp.o \ zchklqt.o zchklqtp.o zchktsqr.o \ - zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o + zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o \ + zchkunhr_col.o zerrunhr_col.o zunhr_col01.o ifdef USEXBLAS ZLINTST += zdrvgbx.o zdrvgex.o zdrvhex.o zdrvsyx.o zdrvpox.o \ @@ -254,47 +254,50 @@ ZLINTSTRFP = zchkrfp.o zdrvrfp.o zdrvrf1.o zdrvrf2.o zdrvrf3.o zdrvrf4.o zerrrfp zlatb4.o zlaipd.o zlarhs.o zsbmv.o zget04.o zpot01.o zpot03.o zpot02.o \ chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o +.PHONY: all all: single double complex complex16 proto-single proto-double proto-complex proto-complex16 +.PHONY: single double complex complex16 single: xlintsts double: xlintstd complex: xlintstc complex16: xlintstz +.PHONY: proto-single proto-double proto-complex proto-complex16 proto-single: xlintstrfs proto-double: xlintstds xlintstrfd proto-complex: xlintstrfc proto-complex16: xlintstzc xlintstrfz -xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstd: $(ALINTST) 
$(DLINTST) $(DZLNTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstds: $(DSLINTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstzc: $(ZCLINTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfs: $(SLINTSTRFP) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfd: $(DLINTSTRFP) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfc: $(CLINTSTRFP) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfz: $(ZLINTSTRFP) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(LOADOPTS) -o $@ $^ +xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(ALINTST): $(FRC) $(SCLNTST): $(FRC) @@ -307,6 +310,7 @@ $(ZLINTST): $(FRC) FRC: @FRC=$(FRC) +.PHONY: clean cleanobj cleanexe clean: cleanobj cleanexe cleanobj: rm -f *.o @@ -314,15 +318,12 @@ cleanexe: rm -f xlintst* schkaa.o: schkaa.f - $(FORTRAN) $(DRVOPTS) -c -o $@ $< + $(FC) $(FFLAGS_DRV) -c -o $@ $< dchkaa.o: dchkaa.f - $(FORTRAN) $(DRVOPTS) -c -o $@ $< + $(FC) $(FFLAGS_DRV) -c -o $@ $< cchkaa.o: cchkaa.f - $(FORTRAN) $(DRVOPTS) -c -o $@ $< + $(FC) $(FFLAGS_DRV) -c -o $@ $< zchkaa.o: zchkaa.f - $(FORTRAN) $(DRVOPTS) -c -o $@ $< - -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< + $(FC) $(FFLAGS_DRV) -c -o $@ $< .NOTPARALLEL: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.f b/lapack-netlib/TESTING/LIN/cchkaa.f index d8d5060c3..d36770be7 100644 --- a/lapack-netlib/TESTING/LIN/cchkaa.f +++ b/lapack-netlib/TESTING/LIN/cchkaa.f @@ -74,6 +74,8 @@ *> CEQ *> CQT *> CQX +*> CTS +*> CHH *> \endverbatim * * Parameters: @@ -108,14 +110,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2017 +*> \date November 2019 * *> \ingroup complex_lin * * ===================================================================== PROGRAM CCHKAA * -* -- LAPACK test routine (version 3.8.0) -- +* -- LAPACK test routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * November 2017 @@ -165,15 +167,16 @@ * .. * .. External Subroutines .. 
EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, - $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKLQ, - $ CCHKPB,CCHKPO, CCHKPS, CCHKPP, CCHKPT, CCHKQ3, - $ CCHKQL, CCHKQR, CCHKRQ, CCHKSP, CCHKSY, - $ CCHKSY_ROOK, CCHKSY_RK, CCHKSY_AA, CCHKTB, - $ CCHKTP, CCHKTR, CCHKTZ, CDRVGB, CDRVGE, CDRVGT, - $ CDRVHE, CDRVHE_ROOK, CDRVHE_RK, CDRVHE_AA, - $ CDRVHP, CDRVLS, CDRVPB, CDRVPO, CDRVPP, CDRVPT, - $ CDRVSP, CDRVSY, CDRVSY_ROOK, CDRVSY_RK, - $ CDRVSY_AA, ILAVER, CCHKQRT, CCHKQRTP + $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, + $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, + $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, + $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, + $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, + $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, + $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, + $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, + $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, + $ CCHKQRT, CCHKQRTP * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -678,7 +681,7 @@ * * HK: Hermitian indefinite matrices, * with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than HR path version. +* different matrix storage format than HR path version. * NTYPES = 10 CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) @@ -838,7 +841,7 @@ * * SK: symmetric indefinite matrices, * with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than SR path version. +* different matrix storage format than SR path version. * NTYPES = 11 CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) @@ -1165,6 +1168,17 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF * ELSE * diff --git a/lapack-netlib/TESTING/LIN/cchkunhr_col.f b/lapack-netlib/TESTING/LIN/cchkunhr_col.f new file mode 100644 index 000000000..00077ddd9 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cchkunhr_col.f @@ -0,0 +1,239 @@ +*> \brief \b CCHKUNHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, +* NBVAL, NOUT ) +* +* .. Scalar Arguments .. +* LOGICAL TSTERR +* INTEGER NM, NN, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER MVAL( * ), NBVAL( * ), NVAL( * ) +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CCHKUNHR_COL tests CUNHR_COL using CLATSQR and CGEMQRT. Therefore, CLATSQR +*> (used in CGEQR) and CGEMQRT (used in CGEMQR) have to be tested +*> before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[in] TSTERR +*> \verbatim +*> TSTERR is LOGICAL +*> Flag that indicates whether error exits are to be tested. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. 
+*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB contained in the vector NBVAL. +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* December 2016 +* +* .. Scalar Arguments .. + LOGICAL TSTERR + INTEGER NM, NN, NNB, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER MVAL( * ), NBVAL( * ), NVAL( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTESTS + PARAMETER ( NTESTS = 6 ) +* .. +* .. Local Scalars .. + CHARACTER(LEN=3) PATH + INTEGER I, IMB1, INB1, INB2, J, T, M, N, MB1, NB1, + $ NB2, NFAIL, NERRS, NRUN +* +* .. Local Arrays .. + REAL RESULT( NTESTS ) +* .. +* .. External Subroutines .. + EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01 +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER(LEN=32) SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Executable Statements .. +* +* Initialize constants +* + PATH( 1: 1 ) = 'C' + PATH( 2: 3 ) = 'HH' + NRUN = 0 + NFAIL = 0 + NERRS = 0 +* +* Test the error exits +* + IF( TSTERR ) CALL CERRUNHR_COL( PATH, NOUT ) + INFOT = 0 +* +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test CUNHR_COL +* + CALL CUNHR_COL01( M, N, MB1, NB1, NB2, + $ RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* +* Print a summary of the results. 
+* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, + $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + RETURN +* +* End of CCHKUNHR_COL +* + END \ No newline at end of file diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index 2c2d9abb8..d24e3885b 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -237,13 +237,13 @@ REAL EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. - INTEGER ISEED( 4 ), ISEEDY( 4 ), IWQ - REAL RESULT( NTESTS ), RWQ - COMPLEX WQ + INTEGER ISEED( 4 ), ISEEDY( 4 ), IWQ( 1 ) + REAL RESULT( NTESTS ), RWQ( 1 ) + COMPLEX WQ( 1 ) * .. * .. Allocatable Arrays .. COMPLEX, ALLOCATABLE :: WORK (:) - REAL, ALLOCATABLE :: RWORK (:) + REAL, ALLOCATABLE :: RWORK (:), WORK2 (:) INTEGER, ALLOCATABLE :: IWORK (:) * .. * .. External Functions .. @@ -363,32 +363,32 @@ * Compute workspace needed for CGELS CALL CGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) - LWORK_CGELS = INT( WQ ) + LWORK_CGELS = INT( WQ( 1 ) ) * Compute workspace needed for CGETSLS CALL CGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) - LWORK_CGETSLS = INT( WQ ) + LWORK_CGETSLS = INT( WQ( 1 ) ) ENDDO END IF * Compute workspace needed for CGELSY CALL CGELSY( M, N, NRHS, A, LDA, B, LDB, $ IWQ, RCOND, CRANK, WQ, -1, RWORK, $ INFO ) - LWORK_CGELSY = INT( WQ ) + LWORK_CGELSY = INT( WQ( 1 ) ) LRWORK_CGELSY = 2*N * Compute workspace needed for CGELSS CALL CGELSS( M, N, NRHS, A, LDA, B, LDB, S, $ RCOND, CRANK, WQ, -1, RWORK, INFO ) - LWORK_CGELSS = INT( WQ ) + LWORK_CGELSS = INT( WQ( 1 ) ) LRWORK_CGELSS = 5*MNMIN * Compute workspace needed for CGELSD CALL CGELSD( M, N, NRHS, A, LDA, B, LDB, S, $ RCOND, CRANK, WQ, -1, RWQ, IWQ, $ INFO ) - LWORK_CGELSD = INT( WQ ) - LRWORK_CGELSD = INT( RWQ ) + LWORK_CGELSD = INT( WQ( 1 ) ) + LRWORK_CGELSD = INT( RWQ ( 1 ) ) * Compute LIWORK workspace needed for CGELSY and CGELSD - LIWORK = MAX( LIWORK, N, IWQ ) + LIWORK = MAX( LIWORK, N, IWQ ( 1 ) ) * Compute LRWORK workspace needed for CGELSY, CGELSS and CGELSD LRWORK = MAX( LRWORK, LRWORK_CGELSY, $ LRWORK_CGELSS, LRWORK_CGELSD ) @@ -408,6 +408,7 @@ ALLOCATE( WORK( LWORK ) ) ALLOCATE( IWORK( LIWORK ) ) ALLOCATE( RWORK( LRWORK ) ) + ALLOCATE( WORK2( 2 * LWORK ) ) * DO 140 IM = 1, NM M = MVAL( IM ) @@ -596,7 +597,7 @@ $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK, + $ LDA, B, LDB, C, LDB, WORK2, $ RESULT( 15 ) ) * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. diff --git a/lapack-netlib/TESTING/LIN/cdrvsy_rk.f b/lapack-netlib/TESTING/LIN/cdrvsy_rk.f index ae313c243..d3ed8c0a9 100644 --- a/lapack-netlib/TESTING/LIN/cdrvsy_rk.f +++ b/lapack-netlib/TESTING/LIN/cdrvsy_rk.f @@ -98,8 +98,9 @@ *> \param[out] E *> \verbatim *> E is COMPLEX array, dimension (NMAX) -*> \param[out] AINV +*> \endverbatim *> +*> \param[out] AINV *> \verbatim *> AINV is COMPLEX array, dimension (NMAX*NMAX) *> \endverbatim diff --git a/lapack-netlib/TESTING/LIN/cerrunhr_col.f b/lapack-netlib/TESTING/LIN/cerrunhr_col.f new file mode 100644 index 000000000..8fd58a683 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cerrunhr_col.f @@ -0,0 +1,164 @@ +*> \brief \b CERRUNHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CERRUNHR_COL( PATH, NUNIT ) +* +* .. Scalar Arguments .. 
+* CHARACTER*3 PATH +* INTEGER NUNIT +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CERRUNHR_COL tests the error exits for CUNHR_COL that does +*> Householder reconstruction from the output of tall-skinny +*> factorization CLATSQR. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] PATH +*> \verbatim +*> PATH is CHARACTER*3 +*> The LAPACK path name for the routines to be tested. +*> \endverbatim +*> +*> \param[in] NUNIT +*> \verbatim +*> NUNIT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CERRUNHR_COL( PATH, NUNIT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + CHARACTER(LEN=3) PATH + INTEGER NUNIT +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 2 ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, J +* .. +* .. Local Arrays .. + COMPLEX A( NMAX, NMAX ), T( NMAX, NMAX ), D(NMAX) +* .. +* .. External Subroutines .. + EXTERNAL ALAESM, CHKXER, CUNHR_COL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER(LEN=32) SRNAMT + INTEGER INFOT, NOUT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NOUT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, CMPLX +* .. +* .. Executable Statements .. +* + NOUT = NUNIT + WRITE( NOUT, FMT = * ) +* +* Set the variables to innocuous values. +* + DO J = 1, NMAX + DO I = 1, NMAX + A( I, J ) = CMPLX( 1.E+0 / REAL( I+J ) ) + T( I, J ) = CMPLX( 1.E+0 / REAL( I+J ) ) + END DO + D( J ) = ( 0.E+0, 0.E+0 ) + END DO + OK = .TRUE. +* +* Error exits for Householder reconstruction +* +* CUNHR_COL +* + SRNAMT = 'CUNHR_COL' +* + INFOT = 1 + CALL CUNHR_COL( -1, 0, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 2 + CALL CUNHR_COL( 0, -1, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) + CALL CUNHR_COL( 1, 2, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 3 + CALL CUNHR_COL( 0, 0, -1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL CUNHR_COL( 0, 0, 0, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 5 + CALL CUNHR_COL( 0, 0, 1, A, -1, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL CUNHR_COL( 0, 0, 1, A, 0, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL CUNHR_COL( 2, 0, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 7 + CALL CUNHR_COL( 0, 0, 1, A, 1, T, -1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL CUNHR_COL( 0, 0, 1, A, 1, T, 0, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL CUNHR_COL( 4, 3, 2, A, 4, T, 1, D, INFO ) + CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK ) +* +* Print a summary line. 
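[Review note] The CUNHR_COL error-exit block above follows the standard LIN-suite pattern: each call passes one deliberately invalid argument, the test build links a non-aborting XERBLA that records the error in LERR and compares the reported position against INFOT (both in the INFOC common block), and CHKXER only verifies that the flag was raised. A paraphrased sketch of CHKXER for orientation (the real TESTING/LIN/chkxer.f may differ in details):

      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
      CHARACTER*(*)      SRNAMT
      INTEGER            INFOT, NOUT
      LOGICAL            LERR, OK
*     If the test XERBLA was never entered, the bad argument
*     went undetected: report it and mark the path as failed.
      IF( .NOT.LERR ) THEN
         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT( 1:LEN_TRIM( SRNAMT ) )
         OK = .FALSE.
      END IF
      LERR = .FALSE.
      RETURN
 9999 FORMAT( ' *** Illegal value of parameter number ', I2,
     $      ' not detected by ', A6, ' ***' )
      END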
+* + CALL ALAESM( PATH, OK, NOUT ) +* + RETURN +* +* End of CERRUNHR_COL +* + END diff --git a/lapack-netlib/TESTING/LIN/cerrvx.f b/lapack-netlib/TESTING/LIN/cerrvx.f index d2d3d2a85..7f929f07f 100644 --- a/lapack-netlib/TESTING/LIN/cerrvx.f +++ b/lapack-netlib/TESTING/LIN/cerrvx.f @@ -739,7 +739,7 @@ $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 - CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1, + CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 7 @@ -769,7 +769,7 @@ $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 - CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1, + CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 7 diff --git a/lapack-netlib/TESTING/LIN/clahilb.f b/lapack-netlib/TESTING/LIN/clahilb.f index f88491a0d..c54884b9f 100644 --- a/lapack-netlib/TESTING/LIN/clahilb.f +++ b/lapack-netlib/TESTING/LIN/clahilb.f @@ -164,7 +164,7 @@ INTEGER NMAX_EXACT, NMAX_APPROX, SIZE_D PARAMETER (NMAX_EXACT = 6, NMAX_APPROX = 11, SIZE_D = 8) * -* d's are generated from random permuation of those eight elements. +* d's are generated from random permutation of those eight elements. COMPLEX D1(8), D2(8), INVD1(8), INVD2(8) DATA D1 /(-1,0),(0,1),(-1,-1),(0,-1),(1,0),(-1,1),(1,1),(1,-1)/ DATA D2 /(-1,0),(0,-1),(-1,1),(0,1),(1,0),(-1,-1),(1,-1),(1,1)/ diff --git a/lapack-netlib/TESTING/LIN/ctsqr01.f b/lapack-netlib/TESTING/LIN/ctsqr01.f index a3bd9ebc9..6d788ba41 100644 --- a/lapack-netlib/TESTING/LIN/ctsqr01.f +++ b/lapack-netlib/TESTING/LIN/ctsqr01.f @@ -114,7 +114,7 @@ * .. * .. Local Arrays .. INTEGER ISEED( 4 ) - COMPLEX TQUERY( 5 ), WORKQUERY + COMPLEX TQUERY( 5 ), WORKQUERY( 1 ) * .. * .. External Functions .. 
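[Review note] The recurring change in this commit from scalar query variables (WQ, RWQ, IWQ, WORKQUERY) to length-1 arrays reflects the LAPACK workspace-query convention: calling a routine with LWORK = -1 makes it store the optimal workspace size in the first element of its array-valued WORK argument, so the actual argument should itself be an array; passing a scalar is an interface mismatch that stricter compilers flag. The idiom, as it reads in ctsqr01.f after this patch (TSIZE = -1 and LWORK = -1 both request size queries only):

      COMPLEX            WORKQUERY( 1 )
      COMPLEX, ALLOCATABLE :: WORK( : )
*     Query call: nothing is computed, only sizes are returned.
      CALL CGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
      LWORK = INT( WORKQUERY( 1 ) )
      ALLOCATE ( WORK( LWORK ) )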
REAL SLAMCH, CLANGE, CLANSY @@ -173,22 +173,22 @@ * CALL CGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO ) TSIZE = INT( TQUERY( 1 ) ) - LWORK = INT( WORKQUERY ) + LWORK = INT( WORKQUERY( 1 ) ) CALL CGEMQR( 'L', 'N', M, M, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL CGEMQR( 'L', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL CGEMQR( 'L', 'C', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL CGEMQR( 'R', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL CGEMQR( 'R', 'C', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) ALLOCATE ( T( TSIZE ) ) ALLOCATE ( WORK( LWORK ) ) srnamt = 'CGEQR' @@ -316,22 +316,22 @@ ELSE CALL CGELQ( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO ) TSIZE = INT( TQUERY( 1 ) ) - LWORK = INT( WORKQUERY ) + LWORK = INT( WORKQUERY( 1 ) ) CALL CGEMLQ( 'R', 'N', N, N, K, AF, M, TQUERY, TSIZE, Q, N, $ WORKQUERY, -1, INFO ) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL CGEMLQ( 'L', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL CGEMLQ( 'L', 'C', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL CGEMLQ( 'R', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL CGEMLQ( 'R', 'C', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) ALLOCATE ( T( TSIZE ) ) ALLOCATE ( WORK( LWORK ) ) srnamt = 'CGELQ' diff --git a/lapack-netlib/TESTING/LIN/cunhr_col01.f b/lapack-netlib/TESTING/LIN/cunhr_col01.f new file mode 100644 index 000000000..d760caba5 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cunhr_col01.f @@ -0,0 +1,390 @@ +*> \brief \b CUNHR_COL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNHR_COL01 tests CUNHR_COL using CLATSQR, CGEMQRT and CUNGTSQR. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR), and +*> CUNGTSQR have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of rows in a row block of an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in a column block of an input test matrix. 
+*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in a column block of an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) +*> +*> RESULT(1) = | A - Q * R | / (eps * m * |A|) +*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) +*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) +*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) +*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) +*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + REAL, ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, I, J, K, L, LWORK, NB1_UB, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX WORKQUERY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, CLANGE, CLANSY + EXTERNAL SLAMCH, CLANGE, CLANSY +* .. +* .. External Subroutines .. + EXTERNAL CLACPY, CLARNV, CLASET, CLATSQR, CUNHR_COL, + $ CUNGTSQR, CSCAL, CGEMM, CGEMQRT, CHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL CLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL CLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in CLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* CLATSQR requires NB1 to be bounded by N. +* + NB1_UB = MIN( NB1, N) +* +* CGEMQRT requires NB2 to be bounded by N. 
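[Review note] The row-block count computed above, NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ), matches the CLATSQR blocking: the first block takes MB1 rows and every later block adds MB1 - N new rows. For example, with M = 10, N = 2 and MB1 = 4, NRB = CEILING( 8 / 2 ) = 4, i.e. one 4-row block followed by three blocks contributing 2 fresh rows each ( 4 + 2 + 2 + 2 = 10 ); T1 must hold one block-reflector factor per block, hence its N * NRB columns.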
+* + NB2_UB = MIN( NB2, N) +* + CALL CLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, + $ WORKQUERY, -1, INFO ) + LWORK = INT( WORKQUERY( 1 ) ) + CALL CUNGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORKQUERY, -1, + $ INFO ) + + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) +* +* In CGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'CLATSQR' + CALL CLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, WORK, LWORK, + $ INFO ) +* +* Copy the factor R into the array R. +* + SRNAMT = 'CLACPY' + CALL CLACPY( 'U', M, N, AF, M, R, M ) +* +* Reconstruct the orthogonal matrix Q. +* + SRNAMT = 'CUNGTSQR' + CALL CUNGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORK, LWORK, + $ INFO ) +* +* Perform the Householder reconstruction, the result is stored in +* the arrays AF and T2. +* + SRNAMT = 'CUNHR_COL' + CALL CUNHR_COL( M, N, NB2, AF, M, T2, NB2, DIAG, INFO ) +* +* Compute the factor R_hr corresponding to the Householder +* reconstructed Q_hr and place it in the upper triangle of AF to +* match the Q storage format in CGEQRT. R_hr = R_tsqr * S, +* this means changing the sign of the I-th row of the matrix R_tsqr +* according to the sign of the I-th diagonal element DIAG(I) of the +* matrix S. +* + SRNAMT = 'CLACPY' + CALL CLACPY( 'U', M, N, R, M, AF, M ) +* + DO I = 1, N + IF( DIAG( I ).EQ.-CONE ) THEN + CALL CSCAL( N+1-I, -CONE, AF( I, I ), M ) + END IF + END DO +* +* End Householder reconstruction routines. +* +* +* Generate the m-by-m matrix Q +* + CALL CLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL CLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL CLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**H)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = CLANGE( '1', M, N, A, M, RWORK ) + RESID = CLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**H)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL CLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL CHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = CLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = CLANGE( '1', M, N, C, M, RWORK ) + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL CGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**H)*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**H)*C| / ( eps * m * |C|) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', 
M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL CLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = CLANGE( '1', N, M, D, N, RWORK ) + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*(Q**H) = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**H)| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of CUNHR_COL01 +* + END diff --git a/lapack-netlib/TESTING/LIN/dchkaa.f b/lapack-netlib/TESTING/LIN/dchkaa.f index c5fd7afda..03575c4d1 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.f +++ b/lapack-netlib/TESTING/LIN/dchkaa.f @@ -68,6 +68,10 @@ *> DEQ *> DQT *> DQX +*> DTQ +*> DXQ +*> DTS +*> DHH *> \endverbatim * * Parameters: @@ -102,17 +106,17 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date April 2012 +*> \date November 2019 * *> \ingroup double_lin * * ===================================================================== PROGRAM DCHKAA * -* -- LAPACK test routine (version 3.8.0) -- +* -- LAPACK test routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* April 2012 +* November 2019 * * ===================================================================== * @@ -159,15 +163,14 @@ * .. * .. External Subroutines .. EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, - $ DCHKPB, DCHKPO, DCHKPS, DCHKPP, DCHKPT, DCHKQ3, - $ DCHKQL, DCHKQR, DCHKRQ, DCHKSP, DCHKSY, - $ DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, DCHKTB, - $ DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, - $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, - $ DDRVPT, DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, - $ DDRVSY_AA, ILAVER, DCHKQRT, - $ DCHKQRTP, DCHKLQTP, DCHKTSQR, DCHKLQT - + $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, + $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, + $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, + $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, + $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, + $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, + $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, + $ DCHKLQT, DCHKTSQR * .. * .. Scalars in Common .. 
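[Review note] The sign fix-up in CUNHR_COL01 above is worth spelling out: CUNHR_COL returns DIAG such that, per its interface, Q_tsqr = Q_hr * S with S = diag( DIAG( 1 ), ..., DIAG( N ) ) and DIAG( I ) = +-1. Then A = Q_tsqr * R_tsqr = Q_hr * ( S * R_tsqr ), so the triangular factor matching the reconstructed Q_hr is S * R_tsqr, i.e. R_tsqr with row I negated exactly when DIAG( I ) = -1; that is what the CSCAL loop implements, using stride M on AF( I, I ) to walk along the I-th row of the upper triangle.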
LOGICAL LERR, OK @@ -1007,8 +1010,20 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF * ELSE + * WRITE( NOUT, FMT = 9990 )PATH END IF diff --git a/lapack-netlib/TESTING/LIN/dchkorhr_col.f b/lapack-netlib/TESTING/LIN/dchkorhr_col.f new file mode 100644 index 000000000..3b3e421eb --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dchkorhr_col.f @@ -0,0 +1,239 @@ +*> \brief \b DCHKORHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, +* NBVAL, NOUT ) +* +* .. Scalar Arguments .. +* LOGICAL TSTERR +* INTEGER NM, NN, NNB, NOUT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER MVAL( * ), NBVAL( * ), NVAL( * ) +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DCHKORHR_COL tests DORHR_COL using DLATSQR and DGEMQRT. Therefore, DLATSQR +*> (used in DGEQR) and DGEMQRT (used in DGEMQR) have to be tested +*> before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[in] TSTERR +*> \verbatim +*> TSTERR is LOGICAL +*> Flag that indicates whether error exits are to be tested. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB contained in the vector NBVAL. +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* December 2016 +* +* .. Scalar Arguments .. + LOGICAL TSTERR + INTEGER NM, NN, NNB, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER MVAL( * ), NBVAL( * ), NVAL( * ) +* .. +* +* ===================================================================== +* +* .. 
Parameters .. + INTEGER NTESTS + PARAMETER ( NTESTS = 6 ) +* .. +* .. Local Scalars .. + CHARACTER(LEN=3) PATH + INTEGER I, IMB1, INB1, INB2, J, T, M, N, MB1, NB1, + $ NB2, NFAIL, NERRS, NRUN +* +* .. Local Arrays .. + DOUBLE PRECISION RESULT( NTESTS ) +* .. +* .. External Subroutines .. + EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01 +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER(LEN=32) SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Executable Statements .. +* +* Initialize constants +* + PATH( 1: 1 ) = 'D' + PATH( 2: 3 ) = 'HH' + NRUN = 0 + NFAIL = 0 + NERRS = 0 +* +* Test the error exits +* + IF( TSTERR ) CALL DERRORHR_COL( PATH, NOUT ) + INFOT = 0 +* +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test DORHR_COL +* + CALL DORHR_COL01( M, N, MB1, NB1, NB2, + $ RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, + $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + RETURN +* +* End of DCHKORHR_COL +* + END \ No newline at end of file diff --git a/lapack-netlib/TESTING/LIN/ddrvls.f b/lapack-netlib/TESTING/LIN/ddrvls.f index 2f4975553..adfd71e09 100644 --- a/lapack-netlib/TESTING/LIN/ddrvls.f +++ b/lapack-netlib/TESTING/LIN/ddrvls.f @@ -233,8 +233,8 @@ DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. - INTEGER ISEED( 4 ), ISEEDY( 4 ), IWQ - DOUBLE PRECISION RESULT( NTESTS ), WQ + INTEGER ISEED( 4 ), ISEEDY( 4 ), IWQ( 1 ) + DOUBLE PRECISION RESULT( NTESTS ), WQ( 1 ) * .. * .. Allocatable Arrays .. 
DOUBLE PRECISION, ALLOCATABLE :: WORK (:) @@ -359,27 +359,27 @@ * Compute workspace needed for DGELS CALL DGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) - LWORK_DGELS = INT ( WQ ) + LWORK_DGELS = INT ( WQ ( 1 ) ) * Compute workspace needed for DGETSLS CALL DGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) - LWORK_DGETSLS = INT( WQ ) + LWORK_DGETSLS = INT( WQ ( 1 ) ) ENDDO END IF * Compute workspace needed for DGELSY CALL DGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ, $ RCOND, CRANK, WQ, -1, INFO ) - LWORK_DGELSY = INT( WQ ) + LWORK_DGELSY = INT( WQ ( 1 ) ) * Compute workspace needed for DGELSS CALL DGELSS( M, N, NRHS, A, LDA, B, LDB, S, $ RCOND, CRANK, WQ, -1 , INFO ) - LWORK_DGELSS = INT( WQ ) + LWORK_DGELSS = INT( WQ ( 1 ) ) * Compute workspace needed for DGELSD CALL DGELSD( M, N, NRHS, A, LDA, B, LDB, S, $ RCOND, CRANK, WQ, -1, IWQ, INFO ) - LWORK_DGELSD = INT( WQ ) + LWORK_DGELSD = INT( WQ ( 1 ) ) * Compute LIWORK workspace needed for DGELSY and DGELSD - LIWORK = MAX( LIWORK, N, IWQ ) + LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGETSLS, $ LWORK_DGELSY, LWORK_DGELSS, diff --git a/lapack-netlib/TESTING/LIN/derrorhr_col.f b/lapack-netlib/TESTING/LIN/derrorhr_col.f new file mode 100644 index 000000000..6d545bc91 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/derrorhr_col.f @@ -0,0 +1,164 @@ +*> \brief \b DERRORHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DERRORHR_COL( PATH, NUNIT ) +* +* .. Scalar Arguments .. +* CHARACTER*3 PATH +* INTEGER NUNIT +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DERRORHR_COL tests the error exits for DORHR_COL that does +*> Householder reconstruction from the ouput of tall-skinny +*> factorization DLATSQR. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] PATH +*> \verbatim +*> PATH is CHARACTER*3 +*> The LAPACK path name for the routines to be tested. +*> \endverbatim +*> +*> \param[in] NUNIT +*> \verbatim +*> NUNIT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DERRORHR_COL( PATH, NUNIT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + CHARACTER(LEN=3) PATH + INTEGER NUNIT +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 2 ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, J +* .. +* .. Local Arrays .. + DOUBLE PRECISION A( NMAX, NMAX ), T( NMAX, NMAX ), D(NMAX) +* .. +* .. External Subroutines .. + EXTERNAL ALAESM, CHKXER, DORHR_COL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER(LEN=32) SRNAMT + INTEGER INFOT, NOUT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NOUT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE +* .. +* .. Executable Statements .. 
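+*
+*     Note: each call below passes exactly one invalid argument to
+*     DORHR_COL (the argument position is given by INFOT); CHKXER then
+*     checks, via the INFOC common block, that XERBLA flagged an error
+*     in argument number INFOT.  This is the standard LAPACK
+*     error-exit test pattern.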
+* + NOUT = NUNIT + WRITE( NOUT, FMT = * ) +* +* Set the variables to innocuous values. +* + DO J = 1, NMAX + DO I = 1, NMAX + A( I, J ) = 1.D+0 / DBLE( I+J ) + T( I, J ) = 1.D+0 / DBLE( I+J ) + END DO + D( J ) = 0.D+0 + END DO + OK = .TRUE. +* +* Error exits for Householder reconstruction +* +* DORHR_COL +* + SRNAMT = 'DORHR_COL' +* + INFOT = 1 + CALL DORHR_COL( -1, 0, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 2 + CALL DORHR_COL( 0, -1, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) + CALL DORHR_COL( 1, 2, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 3 + CALL DORHR_COL( 0, 0, -1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL DORHR_COL( 0, 0, 0, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 5 + CALL DORHR_COL( 0, 0, 1, A, -1, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL DORHR_COL( 0, 0, 1, A, 0, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL DORHR_COL( 2, 0, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 7 + CALL DORHR_COL( 0, 0, 1, A, 1, T, -1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL DORHR_COL( 0, 0, 1, A, 1, T, 0, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL DORHR_COL( 4, 3, 2, A, 4, T, 1, D, INFO ) + CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK ) +* +* Print a summary line. +* + CALL ALAESM( PATH, OK, NOUT ) +* + RETURN +* +* End of DERRORHR_COL +* + END diff --git a/lapack-netlib/TESTING/LIN/derrvx.f b/lapack-netlib/TESTING/LIN/derrvx.f index 3a4a6b7fc..fd1d038a6 100644 --- a/lapack-netlib/TESTING/LIN/derrvx.f +++ b/lapack-netlib/TESTING/LIN/derrvx.f @@ -740,7 +740,7 @@ $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 - CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1, + CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 7 diff --git a/lapack-netlib/TESTING/LIN/dorhr_col01.f b/lapack-netlib/TESTING/LIN/dorhr_col01.f new file mode 100644 index 000000000..3e48de37f --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dorhr_col01.f @@ -0,0 +1,386 @@ +*> \brief \b DORHR_COL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORHR_COL01 tests DORHR_COL using DLATSQR, DGEMQRT and DORGTSQR. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part DGEMQR), DORGTSQR +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. 
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          Number of columns in column block in an output test matrix.
+*> \endverbatim
+*>
+*> \param[out] RESULT
+*> \verbatim
+*>          RESULT is DOUBLE PRECISION array, dimension (6)
+*>          Results of each of the six tests below.
+*>          ( C is an M-by-N random matrix, D is an N-by-M random matrix )
+*>
+*>            RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>            RESULT(2) = | I - (Q**T) * Q | / (eps * m )
+*>            RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>            RESULT(4) = | (Q**T) * C - (Q**T) * C | / (eps * m * |C|)
+*>            RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>            RESULT(6) = | D * (Q**T) - D * (Q**T) | / (eps * m * |D|)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup double_lin
+*
+*  =====================================================================
+      SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           M, N, MB1, NB1, NB2
+*     .. Return values ..
+      DOUBLE PRECISION  RESULT(6)
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local allocatable arrays
+      DOUBLE PRECISION, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:),
+     $                   RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:),
+     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
+*
+*     .. Parameters ..
+      DOUBLE PRECISION  ONE, ZERO
+      PARAMETER         ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL           TESTZEROS
+      INTEGER           INFO, I, J, K, L, LWORK, NB1_UB, NB2_UB, NRB
+      DOUBLE PRECISION  ANORM, EPS, RESID, CNORM, DNORM
+*     ..
+*     .. Local Arrays ..
+      INTEGER           ISEED( 4 )
+      DOUBLE PRECISION  WORKQUERY( 1 )
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION  DLAMCH, DLANGE, DLANSY
+      EXTERNAL          DLAMCH, DLANGE, DLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL          DLACPY, DLARNV, DLASET, DLATSQR, DORHR_COL,
+     $                  DORGTSQR, DSCAL, DGEMM, DGEMQRT, DSYRK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC         CEILING, DBLE, MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      CHARACTER(LEN=32) SRNAMT
+*     ..
+*     .. Common blocks ..
+      COMMON            / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA ISEED / 1988, 1989, 1990, 1991 /
+*
+*     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
+*
+      TESTZEROS = .FALSE.
+*
+      EPS = DLAMCH( 'Epsilon' )
+      K = MIN( M, N )
+      L = MAX( M, N, 1)
+*
+*     Dynamically allocate local arrays
+*
+      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
+     $           C(M,N), CF(M,N),
+     $           D(N,M), DF(N,M) )
+*
+*     Put random numbers into A and copy to AF
+*
+      DO J = 1, N
+         CALL DLARNV( 2, ISEED, M, A( 1, J ) )
+      END DO
+      IF( TESTZEROS ) THEN
+         IF( M.GE.4 ) THEN
+            DO J = 1, N
+               CALL DLARNV( 2, ISEED, M/2, A( M/4, J ) )
+            END DO
+         END IF
+      END IF
+      CALL DLACPY( 'Full', M, N, A, M, AF, M )
+*
+*     Number of row blocks in DLATSQR
+*
+      NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
+*
+      ALLOCATE ( T1( NB1, N * NRB ) )
+      ALLOCATE ( T2( NB2, N ) )
+      ALLOCATE ( DIAG( N ) )
+*
+*     Begin determine LWORK for the array WORK and allocate memory.
+*
+*     DLATSQR requires NB1 to be bounded by N.
+*
+      NB1_UB = MIN( NB1, N)
+*
+*     DGEMQRT requires NB2 to be bounded by N.
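+*
+*     Note: the calls a few lines below with LWORK = -1 are workspace
+*     queries in the usual LAPACK convention: no computation is
+*     performed and the required size of WORK is returned in
+*     WORKQUERY( 1 ), which is why WORKQUERY is declared as an array
+*     of dimension 1 rather than a scalar.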
+*
+      NB2_UB = MIN( NB2, N)
+*
+      CALL DLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1,
+     $              WORKQUERY, -1, INFO )
+      LWORK = INT( WORKQUERY( 1 ) )
+      CALL DORGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORKQUERY, -1,
+     $               INFO )
+
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
+*
+*     In DGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
+*     or M*NB2_UB if SIDE = 'R'.
+*
+      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
+*
+      ALLOCATE ( WORK( LWORK ) )
+*
+*     End allocate memory for WORK.
+*
+*
+*     Begin Householder reconstruction routines
+*
+*     Factor the matrix A in the array AF.
+*
+      SRNAMT = 'DLATSQR'
+      CALL DLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, WORK, LWORK,
+     $              INFO )
+*
+*     Copy the factor R into the array R.
+*
+      SRNAMT = 'DLACPY'
+      CALL DLACPY( 'U', N, N, AF, M, R, M )
+*
+*     Reconstruct the orthogonal matrix Q.
+*
+      SRNAMT = 'DORGTSQR'
+      CALL DORGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORK, LWORK,
+     $               INFO )
+*
+*     Perform the Householder reconstruction; the result is stored in
+*     the arrays AF and T2.
+*
+      SRNAMT = 'DORHR_COL'
+      CALL DORHR_COL( M, N, NB2, AF, M, T2, NB2, DIAG, INFO )
+*
+*     Compute the factor R_hr corresponding to the Householder
+*     reconstructed Q_hr and place it in the upper triangle of AF to
+*     match the Q storage format in DGEQRT. R_hr = R_tsqr * S,
+*     this means changing the sign of the I-th row of the matrix
+*     R_tsqr according to the sign of the I-th diagonal element
+*     DIAG(I) of the matrix S.
+*
+      SRNAMT = 'DLACPY'
+      CALL DLACPY( 'U', N, N, R, M, AF, M )
+*
+      DO I = 1, N
+         IF( DIAG( I ).EQ.-ONE ) THEN
+            CALL DSCAL( N+1-I, -ONE, AF( I, I ), M )
+         END IF
+      END DO
+*
+*     End Householder reconstruction routines.
+*
+*
+*     Generate the m-by-m matrix Q
+*
+      CALL DLASET( 'Full', M, M, ZERO, ONE, Q, M )
+*
+      SRNAMT = 'DGEMQRT'
+      CALL DGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
+     $              WORK, INFO )
+*
+*     Copy R
+*
+      CALL DLASET( 'Full', M, N, ZERO, ZERO, R, M )
+*
+      CALL DLACPY( 'Upper', M, N, AF, M, R, M )
+*
+*     TEST 1
+*     Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1)
+*
+      CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M )
+*
+      ANORM = DLANGE( '1', M, N, A, M, RWORK )
+      RESID = DLANGE( '1', M, N, R, M, RWORK )
+      IF( ANORM.GT.ZERO ) THEN
+         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
+      ELSE
+         RESULT( 1 ) = ZERO
+      END IF
+*
+*     TEST 2
+*     Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2)
+*
+      CALL DLASET( 'Full', M, M, ZERO, ONE, R, M )
+      CALL DSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M )
+      RESID = DLANSY( '1', 'Upper', M, R, M, RWORK )
+      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
+*
+*     Generate random m-by-n matrix C
+*
+      DO J = 1, N
+         CALL DLARNV( 2, ISEED, M, C( 1, J ) )
+      END DO
+      CNORM = DLANGE( '1', M, N, C, M, RWORK )
+      CALL DLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as Q*C = CF
+*
+      SRNAMT = 'DGEMQRT'
+      CALL DGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $              WORK, INFO )
+*
+*     TEST 3
+*     Compute |CF - Q*C| / ( eps * m * |C| )
+*
+      CALL DGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
+      RESID = DLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 3 ) = ZERO
+      END IF
+*
+*     Copy C into CF again
+*
+      CALL DLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as (Q**T)*C = CF
+*
+      SRNAMT = 'DGEMQRT'
+      CALL DGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $              WORK, INFO )
+*
+*     TEST 4
+*     Compute |CF - (Q**T)*C| / ( eps * m * |C| )
+*
+      CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
+      RESID = DLANGE( '1', M, N, CF, M,
RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL DLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = DLANGE( '1', N, M, D, N, RWORK ) + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of DORHR_COL01 +* + END diff --git a/lapack-netlib/TESTING/LIN/dtsqr01.f b/lapack-netlib/TESTING/LIN/dtsqr01.f index 7a50009cc..25bf58a81 100644 --- a/lapack-netlib/TESTING/LIN/dtsqr01.f +++ b/lapack-netlib/TESTING/LIN/dtsqr01.f @@ -115,7 +115,7 @@ * .. * .. Local Arrays .. INTEGER ISEED( 4 ) - DOUBLE PRECISION TQUERY( 5 ), WORKQUERY + DOUBLE PRECISION TQUERY( 5 ), WORKQUERY( 1 ) * .. * .. External Functions .. 
DOUBLE PRECISION DLAMCH, DLANGE, DLANSY @@ -174,22 +174,22 @@ * CALL DGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO ) TSIZE = INT( TQUERY( 1 ) ) - LWORK = INT( WORKQUERY ) + LWORK = INT( WORKQUERY( 1 ) ) CALL DGEMQR( 'L', 'N', M, M, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL DGEMQR( 'L', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL DGEMQR( 'L', 'T', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL DGEMQR( 'R', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL DGEMQR( 'R', 'T', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) ALLOCATE ( T( TSIZE ) ) ALLOCATE ( WORK( LWORK ) ) srnamt = 'DGEQR' @@ -317,22 +317,22 @@ ELSE CALL DGELQ( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO ) TSIZE = INT( TQUERY( 1 ) ) - LWORK = INT( WORKQUERY ) + LWORK = INT( WORKQUERY( 1 ) ) CALL DGEMLQ( 'R', 'N', N, N, K, AF, M, TQUERY, TSIZE, Q, N, $ WORKQUERY, -1, INFO ) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL DGEMLQ( 'L', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL DGEMLQ( 'L', 'T', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL DGEMLQ( 'R', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL DGEMLQ( 'R', 'T', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) ALLOCATE ( T( TSIZE ) ) ALLOCATE ( WORK( LWORK ) ) srnamt = 'DGELQ' diff --git a/lapack-netlib/TESTING/LIN/schkaa.f b/lapack-netlib/TESTING/LIN/schkaa.f index 33b109aa7..a9c13e442 100644 --- a/lapack-netlib/TESTING/LIN/schkaa.f +++ b/lapack-netlib/TESTING/LIN/schkaa.f @@ -68,6 +68,8 @@ *> SEQ *> SQT *> SQX +*> STS +*> SHH *> \endverbatim * * Parameters: @@ -102,17 +104,17 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date April 2012 +*> \date November 2019 * *> \ingroup single_lin * * ===================================================================== PROGRAM SCHKAA * -* -- LAPACK test routine (version 3.8.0) -- +* -- LAPACK test routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* April 2012 +* November 2019 * * ===================================================================== * @@ -159,13 +161,13 @@ * .. * .. External Subroutines .. 
EXTERNAL ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ, - $ SCHKPB, SCHKPO, SCHKPS, SCHKPP, SCHKPT, SCHKQ3, - $ SCHKQL, SCHKQR, SCHKRQ, SCHKSP, SCHKSY, - $ SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, SCHKTB, - $ SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, SDRVGT, - $ SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, SDRVSP, - $ SDRVSY, SDRVSY_ROOK, SDRVSY_RK, SDRVSY_AA, - $ ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP, + $ SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP, + $ SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP, + $ SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, + $ SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, + $ SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, + $ SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK, + $ SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP, $ SCHKLQT, SCHKTSQR * .. * .. Scalars in Common .. @@ -673,7 +675,7 @@ * * SK: symmetric indefinite matrices, * with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than SR path version. +* different matrix storage format than SR path version. * NTYPES = 10 CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) @@ -1004,6 +1006,17 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF * ELSE * diff --git a/lapack-netlib/TESTING/LIN/schkorhr_col.f b/lapack-netlib/TESTING/LIN/schkorhr_col.f new file mode 100644 index 000000000..cf6d2d323 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/schkorhr_col.f @@ -0,0 +1,239 @@ +*> \brief \b SCHKORHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, +* NBVAL, NOUT ) +* +* .. Scalar Arguments .. +* LOGICAL TSTERR +* INTEGER NM, NN, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER MVAL( * ), NBVAL( * ), NVAL( * ) +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SCHKORHR_COL tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[in] TSTERR +*> \verbatim +*> TSTERR is LOGICAL +*> Flag that indicates whether error exits are to be tested. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB contained in the vector NBVAL. 
+*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NBVAL) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup sigle_lin +* +* ===================================================================== + SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* June 2019 +* +* .. Scalar Arguments .. + LOGICAL TSTERR + INTEGER NM, NN, NNB, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER MVAL( * ), NBVAL( * ), NVAL( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTESTS + PARAMETER ( NTESTS = 6 ) +* .. +* .. Local Scalars .. + CHARACTER(LEN=3) PATH + INTEGER I, IMB1, INB1, INB2, J, T, M, N, MB1, NB1, + $ NB2, NFAIL, NERRS, NRUN +* +* .. Local Arrays .. + REAL RESULT( NTESTS ) +* .. +* .. External Subroutines .. + EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01 +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER(LEN=32) SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Executable Statements .. +* +* Initialize constants +* + PATH( 1: 1 ) = 'S' + PATH( 2: 3 ) = 'HH' + NRUN = 0 + NFAIL = 0 + NERRS = 0 +* +* Test the error exits +* + IF( TSTERR ) CALL SERRORHR_COL( PATH, NOUT ) + INFOT = 0 +* +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test SORHR_COL +* + CALL SORHR_COL01( M, N, MB1, NB1, NB2, + $ RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, + $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + RETURN +* +* End of SCHKORHR_COL +* + END \ No newline at end of file diff --git a/lapack-netlib/TESTING/LIN/sdrvls.f b/lapack-netlib/TESTING/LIN/sdrvls.f index 2cf3439b5..649ca558c 100644 --- a/lapack-netlib/TESTING/LIN/sdrvls.f +++ b/lapack-netlib/TESTING/LIN/sdrvls.f @@ -233,8 +233,8 @@ REAL EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. 
- INTEGER ISEED( 4 ), ISEEDY( 4 ), IWQ - REAL RESULT( NTESTS ), WQ + INTEGER ISEED( 4 ), ISEEDY( 4 ), IWQ( 1 ) + REAL RESULT( NTESTS ), WQ( 1 ) * .. * .. Allocatable Arrays .. REAL, ALLOCATABLE :: WORK (:) @@ -358,28 +358,28 @@ * * Compute workspace needed for SGELS CALL SGELS( TRANS, M, N, NRHS, A, LDA, - $ B, LDB, WQ, -1, INFO ) - LWORK_SGELS = INT ( WQ ) + $ B, LDB, WQ( 1 ), -1, INFO ) + LWORK_SGELS = INT ( WQ( 1 ) ) * Compute workspace needed for SGETSLS CALL SGETSLS( TRANS, M, N, NRHS, A, LDA, - $ B, LDB, WQ, -1, INFO ) - LWORK_SGETSLS = INT( WQ ) + $ B, LDB, WQ( 1 ), -1, INFO ) + LWORK_SGETSLS = INT( WQ( 1 ) ) ENDDO END IF * Compute workspace needed for SGELSY CALL SGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ, $ RCOND, CRANK, WQ, -1, INFO ) - LWORK_SGELSY = INT( WQ ) + LWORK_SGELSY = INT( WQ( 1 ) ) * Compute workspace needed for SGELSS CALL SGELSS( M, N, NRHS, A, LDA, B, LDB, S, $ RCOND, CRANK, WQ, -1 , INFO ) - LWORK_SGELSS = INT( WQ ) + LWORK_SGELSS = INT( WQ( 1 ) ) * Compute workspace needed for SGELSD CALL SGELSD( M, N, NRHS, A, LDA, B, LDB, S, $ RCOND, CRANK, WQ, -1, IWQ, INFO ) - LWORK_SGELSD = INT( WQ ) + LWORK_SGELSD = INT( WQ( 1 ) ) * Compute LIWORK workspace needed for SGELSY and SGELSD - LIWORK = MAX( LIWORK, N, IWQ ) + LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGETSLS, $ LWORK_SGELSY, LWORK_SGELSS, diff --git a/lapack-netlib/TESTING/LIN/serrorhr_col.f b/lapack-netlib/TESTING/LIN/serrorhr_col.f new file mode 100644 index 000000000..e8d81a99c --- /dev/null +++ b/lapack-netlib/TESTING/LIN/serrorhr_col.f @@ -0,0 +1,164 @@ +*> \brief \b SERRORHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SERRORHR_COL( PATH, NUNIT ) +* +* .. Scalar Arguments .. +* CHARACTER*3 PATH +* INTEGER NUNIT +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SERRORHR_COL tests the error exits for SORHR_COL that does +*> Householder reconstruction from the ouput of tall-skinny +*> factorization SLATSQR. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] PATH +*> \verbatim +*> PATH is CHARACTER*3 +*> The LAPACK path name for the routines to be tested. +*> \endverbatim +*> +*> \param[in] NUNIT +*> \verbatim +*> NUNIT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup singlr_lin +* +* ===================================================================== + SUBROUTINE SERRORHR_COL( PATH, NUNIT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + CHARACTER(LEN=3) PATH + INTEGER NUNIT +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 2 ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, J +* .. +* .. Local Arrays .. + REAL A( NMAX, NMAX ), T( NMAX, NMAX ), D(NMAX) +* .. +* .. External Subroutines .. + EXTERNAL ALAESM, CHKXER, SORHR_COL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER(LEN=32) SRNAMT + INTEGER INFOT, NOUT +* .. +* .. 
Common blocks .. + COMMON / INFOC / INFOT, NOUT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. +* .. Executable Statements .. +* + NOUT = NUNIT + WRITE( NOUT, FMT = * ) +* +* Set the variables to innocuous values. +* + DO J = 1, NMAX + DO I = 1, NMAX + A( I, J ) = 1.E+0 / REAL( I+J ) + T( I, J ) = 1.E+0 / REAL( I+J ) + END DO + D( J ) = 0.E+0 + END DO + OK = .TRUE. +* +* Error exits for Householder reconstruction +* +* SORHR_COL +* + SRNAMT = 'SORHR_COL' +* + INFOT = 1 + CALL SORHR_COL( -1, 0, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 2 + CALL SORHR_COL( 0, -1, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) + CALL SORHR_COL( 1, 2, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 3 + CALL SORHR_COL( 0, 0, -1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL SORHR_COL( 0, 0, 0, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 5 + CALL SORHR_COL( 0, 0, 1, A, -1, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL SORHR_COL( 0, 0, 1, A, 0, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL SORHR_COL( 2, 0, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 7 + CALL SORHR_COL( 0, 0, 1, A, 1, T, -1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL SORHR_COL( 0, 0, 1, A, 1, T, 0, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL SORHR_COL( 4, 3, 2, A, 4, T, 1, D, INFO ) + CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK ) +* +* Print a summary line. +* + CALL ALAESM( PATH, OK, NOUT ) +* + RETURN +* +* End of SERRORHR_COL +* + END diff --git a/lapack-netlib/TESTING/LIN/serrvx.f b/lapack-netlib/TESTING/LIN/serrvx.f index a63ed38d7..910bff1e5 100644 --- a/lapack-netlib/TESTING/LIN/serrvx.f +++ b/lapack-netlib/TESTING/LIN/serrvx.f @@ -735,7 +735,7 @@ $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 - CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1, + CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 7 diff --git a/lapack-netlib/TESTING/LIN/sorhr_col01.f b/lapack-netlib/TESTING/LIN/sorhr_col01.f new file mode 100644 index 000000000..02429041b --- /dev/null +++ b/lapack-netlib/TESTING/LIN/sorhr_col01.f @@ -0,0 +1,386 @@ +*> \brief \b SORHR_COL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORHR_COL01 tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. 
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          Number of columns in column block in an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          Number of columns in column block in an output test matrix.
+*> \endverbatim
+*>
+*> \param[out] RESULT
+*> \verbatim
+*>          RESULT is REAL array, dimension (6)
+*>          Results of each of the six tests below.
+*>          ( C is an M-by-N random matrix, D is an N-by-M random matrix )
+*>
+*>            RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>            RESULT(2) = | I - (Q**T) * Q | / (eps * m )
+*>            RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>            RESULT(4) = | (Q**T) * C - (Q**T) * C | / (eps * m * |C|)
+*>            RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>            RESULT(6) = | D * (Q**T) - D * (Q**T) | / (eps * m * |D|)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup single_lin
+*
+*  =====================================================================
+      SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           M, N, MB1, NB1, NB2
+*     .. Return values ..
+      REAL              RESULT(6)
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local allocatable arrays
+      REAL            , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:),
+     $                   RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:),
+     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
+*
+*     .. Parameters ..
+      REAL              ONE, ZERO
+      PARAMETER         ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL           TESTZEROS
+      INTEGER           INFO, I, J, K, L, LWORK, NB1_UB, NB2_UB, NRB
+      REAL              ANORM, EPS, RESID, CNORM, DNORM
+*     ..
+*     .. Local Arrays ..
+      INTEGER           ISEED( 4 )
+      REAL              WORKQUERY( 1 )
+*     ..
+*     .. External Functions ..
+      REAL              SLAMCH, SLANGE, SLANSY
+      EXTERNAL          SLAMCH, SLANGE, SLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL          SLACPY, SLARNV, SLASET, SLATSQR, SORHR_COL,
+     $                  SORGTSQR, SSCAL, SGEMM, SGEMQRT, SSYRK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC         CEILING, MAX, MIN, REAL
+*     ..
+*     .. Scalars in Common ..
+      CHARACTER(LEN=32) SRNAMT
+*     ..
+*     .. Common blocks ..
+      COMMON            / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA ISEED / 1988, 1989, 1990, 1991 /
+*
+*     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
+*
+      TESTZEROS = .FALSE.
+*
+      EPS = SLAMCH( 'Epsilon' )
+      K = MIN( M, N )
+      L = MAX( M, N, 1)
+*
+*     Dynamically allocate local arrays
+*
+      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
+     $           C(M,N), CF(M,N),
+     $           D(N,M), DF(N,M) )
+*
+*     Put random numbers into A and copy to AF
+*
+      DO J = 1, N
+         CALL SLARNV( 2, ISEED, M, A( 1, J ) )
+      END DO
+      IF( TESTZEROS ) THEN
+         IF( M.GE.4 ) THEN
+            DO J = 1, N
+               CALL SLARNV( 2, ISEED, M/2, A( M/4, J ) )
+            END DO
+         END IF
+      END IF
+      CALL SLACPY( 'Full', M, N, A, M, AF, M )
+*
+*     Number of row blocks in SLATSQR
+*
+      NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
+*
+      ALLOCATE ( T1( NB1, N * NRB ) )
+      ALLOCATE ( T2( NB2, N ) )
+      ALLOCATE ( DIAG( N ) )
+*
+*     Begin determine LWORK for the array WORK and allocate memory.
+*
+*     SLATSQR requires NB1 to be bounded by N.
+*
+      NB1_UB = MIN( NB1, N)
+*
+*     SGEMQRT requires NB2 to be bounded by N.
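+*
+*     Note: as in DORHR_COL01, LWORK = -1 in the calls a few lines
+*     below performs a workspace query; the required size of WORK is
+*     returned in WORKQUERY( 1 ).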
+*
+      NB2_UB = MIN( NB2, N)
+*
+      CALL SLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1,
+     $              WORKQUERY, -1, INFO )
+      LWORK = INT( WORKQUERY( 1 ) )
+      CALL SORGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORKQUERY, -1,
+     $               INFO )
+
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
+*
+*     In SGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
+*     or M*NB2_UB if SIDE = 'R'.
+*
+      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
+*
+      ALLOCATE ( WORK( LWORK ) )
+*
+*     End allocate memory for WORK.
+*
+*
+*     Begin Householder reconstruction routines
+*
+*     Factor the matrix A in the array AF.
+*
+      SRNAMT = 'SLATSQR'
+      CALL SLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, WORK, LWORK,
+     $              INFO )
+*
+*     Copy the factor R into the array R.
+*
+      SRNAMT = 'SLACPY'
+      CALL SLACPY( 'U', N, N, AF, M, R, M )
+*
+*     Reconstruct the orthogonal matrix Q.
+*
+      SRNAMT = 'SORGTSQR'
+      CALL SORGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORK, LWORK,
+     $               INFO )
+*
+*     Perform the Householder reconstruction; the result is stored in
+*     the arrays AF and T2.
+*
+      SRNAMT = 'SORHR_COL'
+      CALL SORHR_COL( M, N, NB2, AF, M, T2, NB2, DIAG, INFO )
+*
+*     Compute the factor R_hr corresponding to the Householder
+*     reconstructed Q_hr and place it in the upper triangle of AF to
+*     match the Q storage format in SGEQRT. R_hr = R_tsqr * S,
+*     this means changing the sign of the I-th row of the matrix
+*     R_tsqr according to the sign of the I-th diagonal element
+*     DIAG(I) of the matrix S.
+*
+      SRNAMT = 'SLACPY'
+      CALL SLACPY( 'U', N, N, R, M, AF, M )
+*
+      DO I = 1, N
+         IF( DIAG( I ).EQ.-ONE ) THEN
+            CALL SSCAL( N+1-I, -ONE, AF( I, I ), M )
+         END IF
+      END DO
+*
+*     End Householder reconstruction routines.
+*
+*
+*     Generate the m-by-m matrix Q
+*
+      CALL SLASET( 'Full', M, M, ZERO, ONE, Q, M )
+*
+      SRNAMT = 'SGEMQRT'
+      CALL SGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
+     $              WORK, INFO )
+*
+*     Copy R
+*
+      CALL SLASET( 'Full', M, N, ZERO, ZERO, R, M )
+*
+      CALL SLACPY( 'Upper', M, N, AF, M, R, M )
+*
+*     TEST 1
+*     Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1)
+*
+      CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M )
+*
+      ANORM = SLANGE( '1', M, N, A, M, RWORK )
+      RESID = SLANGE( '1', M, N, R, M, RWORK )
+      IF( ANORM.GT.ZERO ) THEN
+         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
+      ELSE
+         RESULT( 1 ) = ZERO
+      END IF
+*
+*     TEST 2
+*     Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2)
+*
+      CALL SLASET( 'Full', M, M, ZERO, ONE, R, M )
+      CALL SSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M )
+      RESID = SLANSY( '1', 'Upper', M, R, M, RWORK )
+      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
+*
+*     Generate random m-by-n matrix C
+*
+      DO J = 1, N
+         CALL SLARNV( 2, ISEED, M, C( 1, J ) )
+      END DO
+      CNORM = SLANGE( '1', M, N, C, M, RWORK )
+      CALL SLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as Q*C = CF
+*
+      SRNAMT = 'SGEMQRT'
+      CALL SGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $              WORK, INFO )
+*
+*     TEST 3
+*     Compute |CF - Q*C| / ( eps * m * |C| )
+*
+      CALL SGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
+      RESID = SLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 3 ) = ZERO
+      END IF
+*
+*     Copy C into CF again
+*
+      CALL SLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as (Q**T)*C = CF
+*
+      SRNAMT = 'SGEMQRT'
+      CALL SGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $              WORK, INFO )
+*
+*     TEST 4
+*     Compute |CF - (Q**T)*C| / ( eps * m * |C| )
+*
+      CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
+      RESID = SLANGE( '1', M, N, CF, M,
RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL SLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = SLANGE( '1', N, M, D, N, RWORK ) + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of SORHR_COL01 +* + END diff --git a/lapack-netlib/TESTING/LIN/stsqr01.f b/lapack-netlib/TESTING/LIN/stsqr01.f index b661d61f4..8eb69eae7 100644 --- a/lapack-netlib/TESTING/LIN/stsqr01.f +++ b/lapack-netlib/TESTING/LIN/stsqr01.f @@ -115,7 +115,7 @@ * .. * .. Local Arrays .. INTEGER ISEED( 4 ) - REAL TQUERY( 5 ), WORKQUERY + REAL TQUERY( 5 ), WORKQUERY( 1 ) * .. * .. External Functions .. REAL SLAMCH, SLANGE, SLANSY @@ -174,22 +174,22 @@ * CALL SGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO ) TSIZE = INT( TQUERY( 1 ) ) - LWORK = INT( WORKQUERY ) + LWORK = INT( WORKQUERY( 1 ) ) CALL SGEMQR( 'L', 'N', M, M, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL SGEMQR( 'L', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL SGEMQR( 'L', 'T', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL SGEMQR( 'R', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL SGEMQR( 'R', 'T', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) ALLOCATE ( T( TSIZE ) ) ALLOCATE ( WORK( LWORK ) ) srnamt = 'SGEQR' @@ -317,22 +317,22 @@ ELSE CALL SGELQ( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO ) TSIZE = INT( TQUERY( 1 ) ) - LWORK = INT( WORKQUERY ) + LWORK = INT( WORKQUERY( 1 )) CALL SGEMLQ( 'R', 'N', N, N, K, AF, M, TQUERY, TSIZE, Q, N, $ WORKQUERY, -1, INFO ) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL SGEMLQ( 'L', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL SGEMLQ( 'L', 'T', N, M, K, AF, M, TQUERY, TSIZE, DF, N, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, 
INT( WORKQUERY( 1 ) ) ) CALL SGEMLQ( 'R', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) CALL SGEMLQ( 'R', 'T', M, N, K, AF, M, TQUERY, TSIZE, CF, M, $ WORKQUERY, -1, INFO) - LWORK = MAX( LWORK, INT( WORKQUERY ) ) + LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) ) ALLOCATE ( T( TSIZE ) ) ALLOCATE ( WORK( LWORK ) ) srnamt = 'SGELQ' diff --git a/lapack-netlib/TESTING/LIN/zchkaa.f b/lapack-netlib/TESTING/LIN/zchkaa.f index d2be2525d..30d2a084a 100644 --- a/lapack-netlib/TESTING/LIN/zchkaa.f +++ b/lapack-netlib/TESTING/LIN/zchkaa.f @@ -74,6 +74,8 @@ *> ZEQ *> ZQT *> ZQX +*> ZTS +*> ZHH *> \endverbatim * * Parameters: @@ -108,17 +110,17 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2017 +*> \date November 2019 * *> \ingroup complex16_lin * * ===================================================================== PROGRAM ZCHKAA * -* -- LAPACK test routine (version 3.8.0) -- +* -- LAPACK test routine (version 3.9.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2017 +* November 2019 * * ===================================================================== * @@ -166,16 +168,16 @@ * .. External Subroutines .. EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, - $ ZCHKLQ, ZCHKPB, ZCHKPO, ZCHKPS, ZCHKPP, ZCHKPT, - $ ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, ZCHKSP, ZCHKSY, - $ ZCHKSY_ROOK, ZCHKSY_RK, ZCHKSY_AA, ZCHKTB, - $ ZCHKTP, ZCHKTR, ZCHKTZ, ZDRVGB, ZDRVGE, ZDRVGT, - $ ZDRVHE, ZDRVHE_ROOK, ZDRVHE_RK, ZDRVHE_AA, - $ ZDRVHE_AA_2STAGE, ZDRVHP, ZDRVLS, ZDRVPB, - $ ZDRVPO, ZDRVPP, ZDRVPT, ZDRVSP, ZDRVSY, - $ ZDRVSY_ROOK, ZDRVSY_RK, ZDRVSY_AA, - $ ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, ZCHKQRTP, - $ ZCHKLQT, ZCHKLQTP, ZCHKTSQR + $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, + $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, + $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, + $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, + $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, + $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, + $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, + $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, + $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, + $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -679,7 +681,7 @@ * * HK: Hermitian indefinite matrices, * with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than HR path version. +* different matrix storage format than HR path version. * NTYPES = 10 CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) @@ -839,7 +841,7 @@ * * SK: symmetric indefinite matrices, * with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than SR path version. +* different matrix storage format than SR path version. 
* NTYPES = 11 CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) @@ -1201,6 +1203,17 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF * ELSE * diff --git a/lapack-netlib/TESTING/LIN/zchkunhr_col.f b/lapack-netlib/TESTING/LIN/zchkunhr_col.f new file mode 100644 index 000000000..ef8f8bcc4 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zchkunhr_col.f @@ -0,0 +1,239 @@ +*> \brief \b ZCHKUNHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, +* NBVAL, NOUT ) +* +* .. Scalar Arguments .. +* LOGICAL TSTERR +* INTEGER NM, NN, NNB, NOUT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER MVAL( * ), NBVAL( * ), NVAL( * ) +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZCHKUNHR_COL tests ZUNHR_COL using ZLATSQR and ZGEMQRT. Therefore, ZLATSQR +*> (used in ZGEQR) and ZGEMQRT (used in ZGEMQR) have to be tested +*> before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[in] TSTERR +*> \verbatim +*> TSTERR is LOGICAL +*> Flag that indicates whether error exits are to be tested. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB contained in the vector NBVAL. +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NBVAL) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* December 2016 +* +* .. Scalar Arguments .. + LOGICAL TSTERR + INTEGER NM, NN, NNB, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER MVAL( * ), NBVAL( * ), NVAL( * ) +* .. 
+* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTESTS + PARAMETER ( NTESTS = 6 ) +* .. +* .. Local Scalars .. + CHARACTER(LEN=3) PATH + INTEGER I, IMB1, INB1, INB2, J, T, M, N, MB1, NB1, + $ NB2, NFAIL, NERRS, NRUN +* +* .. Local Arrays .. + DOUBLE PRECISION RESULT( NTESTS ) +* .. +* .. External Subroutines .. + EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01 +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER(LEN=32) SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Executable Statements .. +* +* Initialize constants +* + PATH( 1: 1 ) = 'Z' + PATH( 2: 3 ) = 'HH' + NRUN = 0 + NFAIL = 0 + NERRS = 0 +* +* Test the error exits +* + IF( TSTERR ) CALL ZERRUNHR_COL( PATH, NOUT ) + INFOT = 0 +* +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test ZUNHR_COL +* + CALL ZUNHR_COL01( M, N, MB1, NB1, NB2, + $ RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, + $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + RETURN +* +* End of ZCHKUNHR_COL +* + END \ No newline at end of file diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_rk.f b/lapack-netlib/TESTING/LIN/zdrvhe_rk.f index 93c3fe61d..355260aad 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhe_rk.f +++ b/lapack-netlib/TESTING/LIN/zdrvhe_rk.f @@ -98,6 +98,7 @@ *> \param[out] E *> \verbatim *> E is COMPLEX*16 array, dimension (NMAX) +*> \endverbatim *> *> \param[out] AINV *> \verbatim diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 681852bc2..4587c5686 100644 --- a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -237,13 +237,13 @@ DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. - INTEGER ISEED( 4 ), ISEEDY( 4 ), IWQ - DOUBLE PRECISION RESULT( NTESTS ), RWQ - COMPLEX*16 WQ + INTEGER ISEED( 4 ), ISEEDY( 4 ), IWQ( 1 ) + DOUBLE PRECISION RESULT( NTESTS ), RWQ( 1 ) + COMPLEX*16 WQ( 1 ) * .. * .. Allocatable Arrays .. COMPLEX*16, ALLOCATABLE :: WORK (:) - DOUBLE PRECISION, ALLOCATABLE :: RWORK (:) + DOUBLE PRECISION, ALLOCATABLE :: RWORK (:), WORK2 (:) INTEGER, ALLOCATABLE :: IWORK (:) * .. * .. External Functions .. 
@@ -363,32 +363,32 @@ * Compute workspace needed for ZGELS CALL ZGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) - LWORK_ZGELS = INT ( WQ ) + LWORK_ZGELS = INT ( WQ( 1 ) ) * Compute workspace needed for ZGETSLS CALL ZGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) - LWORK_ZGETSLS = INT( WQ ) + LWORK_ZGETSLS = INT( WQ( 1 ) ) ENDDO END IF * Compute workspace needed for ZGELSY CALL ZGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ, $ RCOND, CRANK, WQ, -1, RWORK, INFO ) - LWORK_ZGELSY = INT( WQ ) + LWORK_ZGELSY = INT( WQ( 1 ) ) LRWORK_ZGELSY = 2*N * Compute workspace needed for ZGELSS CALL ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, $ RCOND, CRANK, WQ, -1 , RWORK, $ INFO ) - LWORK_ZGELSS = INT( WQ ) + LWORK_ZGELSS = INT( WQ( 1 ) ) LRWORK_ZGELSS = 5*MNMIN * Compute workspace needed for ZGELSD CALL ZGELSD( M, N, NRHS, A, LDA, B, LDB, S, $ RCOND, CRANK, WQ, -1, RWQ, IWQ, $ INFO ) - LWORK_ZGELSD = INT( WQ ) - LRWORK_ZGELSD = INT( RWQ ) + LWORK_ZGELSD = INT( WQ( 1 ) ) + LRWORK_ZGELSD = INT( RWQ ( 1 ) ) * Compute LIWORK workspace needed for ZGELSY and ZGELSD - LIWORK = MAX( LIWORK, N, IWQ ) + LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LRWORK workspace needed for ZGELSY, ZGELSS and ZGELSD LRWORK = MAX( LRWORK, LRWORK_ZGELSY, $ LRWORK_ZGELSS, LRWORK_ZGELSD ) @@ -406,6 +406,7 @@ LWLSY = LWORK * ALLOCATE( WORK( LWORK ) ) + ALLOCATE( WORK2( 2 * LWORK ) ) ALLOCATE( IWORK( LIWORK ) ) ALLOCATE( RWORK( LRWORK ) ) * @@ -596,7 +597,7 @@ $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK, + $ LDA, B, LDB, C, LDB, WORK2, $ RESULT( 15 ) ) * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. diff --git a/lapack-netlib/TESTING/LIN/zerrunhr_col.f b/lapack-netlib/TESTING/LIN/zerrunhr_col.f new file mode 100644 index 000000000..4fb62734d --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zerrunhr_col.f @@ -0,0 +1,164 @@ +*> \brief \b ZERRUNHR_COL +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZERRUNHR_COL( PATH, NUNIT ) +* +* .. Scalar Arguments .. +* CHARACTER*3 PATH +* INTEGER NUNIT +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZERRUNHR_COL tests the error exits for ZUNHR_COL that does +*> Householder reconstruction from the ouput of tall-skinny +*> factorization ZLATSQR. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] PATH +*> \verbatim +*> PATH is CHARACTER*3 +*> The LAPACK path name for the routines to be tested. +*> \endverbatim +*> +*> \param[in] NUNIT +*> \verbatim +*> NUNIT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date November 2019 +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZERRUNHR_COL( PATH, NUNIT ) + IMPLICIT NONE +* +* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2019 +* +* .. Scalar Arguments .. + CHARACTER(LEN=3) PATH + INTEGER NUNIT +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 2 ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, J +* .. +* .. 
Local Arrays .. + COMPLEX*16 A( NMAX, NMAX ), T( NMAX, NMAX ), D(NMAX) +* .. +* .. External Subroutines .. + EXTERNAL ALAESM, CHKXER, ZUNHR_COL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER(LEN=32) SRNAMT + INTEGER INFOT, NOUT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NOUT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCMPLX +* .. +* .. Executable Statements .. +* + NOUT = NUNIT + WRITE( NOUT, FMT = * ) +* +* Set the variables to innocuous values. +* + DO J = 1, NMAX + DO I = 1, NMAX + A( I, J ) = DCMPLX( 1.D+0 / DBLE( I+J ) ) + T( I, J ) = DCMPLX( 1.D+0 / DBLE( I+J ) ) + END DO + D( J ) = ( 0.D+0, 0.D+0 ) + END DO + OK = .TRUE. +* +* Error exits for Householder reconstruction +* +* ZUNHR_COL +* + SRNAMT = 'ZUNHR_COL' +* + INFOT = 1 + CALL ZUNHR_COL( -1, 0, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 2 + CALL ZUNHR_COL( 0, -1, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) + CALL ZUNHR_COL( 1, 2, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 3 + CALL ZUNHR_COL( 0, 0, -1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL ZUNHR_COL( 0, 0, 0, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 5 + CALL ZUNHR_COL( 0, 0, 1, A, -1, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL ZUNHR_COL( 0, 0, 1, A, 0, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL ZUNHR_COL( 2, 0, 1, A, 1, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + INFOT = 7 + CALL ZUNHR_COL( 0, 0, 1, A, 1, T, -1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL ZUNHR_COL( 0, 0, 1, A, 1, T, 0, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* + CALL ZUNHR_COL( 4, 3, 2, A, 4, T, 1, D, INFO ) + CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK ) +* +* Print a summary line. +* + CALL ALAESM( PATH, OK, NOUT ) +* + RETURN +* +* End of ZERRUNHR_COL +* + END diff --git a/lapack-netlib/TESTING/LIN/zerrvx.f b/lapack-netlib/TESTING/LIN/zerrvx.f index 29ba744ed..7759384e6 100644 --- a/lapack-netlib/TESTING/LIN/zerrvx.f +++ b/lapack-netlib/TESTING/LIN/zerrvx.f @@ -94,7 +94,7 @@ $ ZHPSV, ZHPSVX, ZPBSV, ZPBSVX, ZPOSV, ZPOSVX, $ ZPPSV, ZPPSVX, ZPTSV, ZPTSVX, ZSPSV, ZSPSVX, $ ZSYSV, ZSYSV_AA, ZSYSV_RK, ZSYSV_ROOK, - $ ZSYSVX, ZSYSV_AA_2STAGE + $ ZSYSVX, ZHESV_AA_2STAGE * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -721,7 +721,7 @@ * ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN * -* CHESV_AASEN_2STAGE +* ZHESV_AASEN_2STAGE * SRNAMT = 'ZHESV_AA_2STAGE' INFOT = 1 @@ -741,7 +741,7 @@ $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 - CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1, + CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 7 @@ -749,6 +749,36 @@ $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* ZSYSV_AASEN_2STAGE +* + SRNAMT = 'ZSYSV_AA_2STAGE' + INFOT = 1 + CALL ZSYSV_AA_2STAGE( '/', 0, 0, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYSV_AA_2STAGE( 'U', -1, 0, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYSV_AA_2STAGE( 'U', 0, -1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) +** ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * ZHPSV diff --git a/lapack-netlib/TESTING/LIN/zlahilb.f b/lapack-netlib/TESTING/LIN/zlahilb.f index a6dc79b20..ba83af825 100644 --- a/lapack-netlib/TESTING/LIN/zlahilb.f +++ b/lapack-netlib/TESTING/LIN/zlahilb.f @@ -164,7 +164,7 @@ INTEGER NMAX_EXACT, NMAX_APPROX, SIZE_D PARAMETER (NMAX_EXACT = 6, NMAX_APPROX = 11, SIZE_D = 8) * -* d's are generated from random permuation of those eight elements. +* d's are generated from random permutation of those eight elements. COMPLEX*16 d1(8), d2(8), invd1(8), invd2(8) DATA D1 /(-1,0),(0,1),(-1,-1),(0,-1),(1,0),(-1,1),(1,1),(1,-1)/ DATA D2 /(-1,0),(0,-1),(-1,1),(0,1),(1,0),(-1,-1),(1,-1),(1,1)/ diff --git a/lapack-netlib/TESTING/LIN/ztsqr01.f b/lapack-netlib/TESTING/LIN/ztsqr01.f index 094473888..81d7fdb44 100644 --- a/lapack-netlib/TESTING/LIN/ztsqr01.f +++ b/lapack-netlib/TESTING/LIN/ztsqr01.f @@ -114,7 +114,7 @@ * .. * .. Local Arrays .. INTEGER ISEED( 4 ) - COMPLEX*16 TQUERY( 5 ), WORKQUERY + COMPLEX*16 TQUERY( 5 ), WORKQUERY( 1 ) * .. * .. External Functions .. 
      DOUBLE PRECISION   DLAMCH, ZLANGE, ZLANSY
@@ -173,22 +173,22 @@
*
      CALL ZGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
      TSIZE = INT( TQUERY( 1 ) )
-     LWORK = INT( WORKQUERY )
+     LWORK = INT( WORKQUERY( 1 ) )
      CALL ZGEMQR( 'L', 'N', M, M, K, AF, M, TQUERY, TSIZE, CF, M,
     $             WORKQUERY, -1, INFO)
-     LWORK = MAX( LWORK, INT( WORKQUERY ) )
+     LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
      CALL ZGEMQR( 'L', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
     $             WORKQUERY, -1, INFO)
-     LWORK = MAX( LWORK, INT( WORKQUERY ) )
+     LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
      CALL ZGEMQR( 'L', 'C', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
     $             WORKQUERY, -1, INFO)
-     LWORK = MAX( LWORK, INT( WORKQUERY ) )
+     LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
      CALL ZGEMQR( 'R', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
     $             WORKQUERY, -1, INFO)
-     LWORK = MAX( LWORK, INT( WORKQUERY ) )
+     LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
      CALL ZGEMQR( 'R', 'C', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
     $             WORKQUERY, -1, INFO)
-     LWORK = MAX( LWORK, INT( WORKQUERY ) )
+     LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
      ALLOCATE ( T( TSIZE ) )
      ALLOCATE ( WORK( LWORK ) )
      srnamt = 'ZGEQR'
@@ -316,22 +316,22 @@
      ELSE
         CALL ZGELQ( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
         TSIZE = INT( TQUERY( 1 ) )
-        LWORK = INT( WORKQUERY )
+        LWORK = INT( WORKQUERY( 1 ) )
        CALL ZGEMLQ( 'R', 'N', N, N, K, AF, M, TQUERY, TSIZE, Q, N,
     $               WORKQUERY, -1, INFO )
-        LWORK = MAX( LWORK, INT( WORKQUERY ) )
+        LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
        CALL ZGEMLQ( 'L', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
     $               WORKQUERY, -1, INFO)
-        LWORK = MAX( LWORK, INT( WORKQUERY ) )
+        LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
        CALL ZGEMLQ( 'L', 'C', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
     $               WORKQUERY, -1, INFO)
-        LWORK = MAX( LWORK, INT( WORKQUERY ) )
+        LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
        CALL ZGEMLQ( 'R', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
     $               WORKQUERY, -1, INFO)
-        LWORK = MAX( LWORK, INT( WORKQUERY ) )
+        LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
        CALL ZGEMLQ( 'R', 'C', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
     $               WORKQUERY, -1, INFO)
-        LWORK = MAX( LWORK, INT( WORKQUERY ) )
+        LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
        ALLOCATE ( T( TSIZE ) )
        ALLOCATE ( WORK( LWORK ) )
        srnamt = 'ZGELQ'
diff --git a/lapack-netlib/TESTING/LIN/zunhr_col01.f b/lapack-netlib/TESTING/LIN/zunhr_col01.f
new file mode 100644
index 000000000..9fb3bf352
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/zunhr_col01.f
@@ -0,0 +1,390 @@
+*> \brief \b ZUNHR_COL01
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           M, N, MB1, NB1, NB2
+*       .. Return values ..
+*       DOUBLE PRECISION  RESULT(6)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZUNHR_COL01 tests ZUNHR_COL using ZLATSQR, ZGEMQRT and ZUNGTSQR.
+*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR),
+*> ZUNGTSQR have to be tested before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          Number of rows in test matrix.
+*> \endverbatim
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          Number of columns in test matrix.
+*> \endverbatim
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          Number of rows in a row block in an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          Number of columns in a column block in an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          Number of columns in a column block in an output test matrix.
+*> \endverbatim
+*>
+*> \param[out] RESULT
+*> \verbatim
+*>          RESULT is DOUBLE PRECISION array, dimension (6)
+*>          Results of each of the six tests below.
+*>          ( C is an M-by-N random matrix, D is an N-by-M random matrix )
+*>
+*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex16_lin
+*
+*  =====================================================================
+      SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            M, N, MB1, NB1, NB2
+*     .. Return values ..
+      DOUBLE PRECISION   RESULT(6)
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local allocatable arrays
+      COMPLEX*16, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
+     $                   WORK( : ), T1(:,:), T2(:,:), DIAG(:),
+     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
+      DOUBLE PRECISION, ALLOCATABLE :: RWORK(:)
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D+0 )
+      COMPLEX*16         CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
+     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            TESTZEROS
+      INTEGER            INFO, I, J, K, L, LWORK, NB1_UB, NB2_UB, NRB
+      DOUBLE PRECISION   ANORM, EPS, RESID, CNORM, DNORM
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 )
+      COMPLEX*16         WORKQUERY( 1 )
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, ZLANGE, ZLANSY
+      EXTERNAL           DLAMCH, ZLANGE, ZLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLACPY, ZLARNV, ZLASET, ZLATSQR, ZUNHR_COL,
+     $                   ZUNGTSQR, ZSCAL, ZGEMM, ZGEMQRT, ZHERK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, DBLE, MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      CHARACTER(LEN=32)  SRNAMT
+*     ..
+*     .. Common blocks ..
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA ISEED / 1988, 1989, 1990, 1991 /
+*
+*     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
+*
+      TESTZEROS = .FALSE.
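+*
+*     (TESTZEROS is hard-wired to .FALSE.; setting it to .TRUE., for
+*     M >= 4, activates the alternate initialization of A below that
+*     corresponds to the half-zeros test mode named above.)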
+*
+      EPS = DLAMCH( 'Epsilon' )
+      K = MIN( M, N )
+      L = MAX( M, N, 1)
+*
+*     Dynamically allocate local arrays
+*
+      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
+     $           C(M,N), CF(M,N),
+     $           D(N,M), DF(N,M) )
+*
+*     Put random numbers into A and copy to AF
+*
+      DO J = 1, N
+         CALL ZLARNV( 2, ISEED, M, A( 1, J ) )
+      END DO
+      IF( TESTZEROS ) THEN
+         IF( M.GE.4 ) THEN
+            DO J = 1, N
+               CALL ZLARNV( 2, ISEED, M/2, A( M/4, J ) )
+            END DO
+         END IF
+      END IF
+      CALL ZLACPY( 'Full', M, N, A, M, AF, M )
+*
+*     Number of row blocks in ZLATSQR
+*
+      NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
+*
+      ALLOCATE ( T1( NB1, N * NRB ) )
+      ALLOCATE ( T2( NB2, N ) )
+      ALLOCATE ( DIAG( N ) )
+*
+*     Begin: determine LWORK for the array WORK and allocate memory.
+*
+*     ZLATSQR requires NB1 to be bounded by N.
+*
+      NB1_UB = MIN( NB1, N)
+*
+*     ZGEMQRT requires NB2 to be bounded by N.
+*
+      NB2_UB = MIN( NB2, N)
+*
+      CALL ZLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1,
+     $              WORKQUERY, -1, INFO )
+      LWORK = INT( WORKQUERY( 1 ) )
+      CALL ZUNGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORKQUERY, -1,
+     $               INFO )
+
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
+*
+*     In ZGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
+*     or  M*NB2_UB if SIDE = 'R'.
+*
+      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
+*
+      ALLOCATE ( WORK( LWORK ) )
+*
+*     End: allocate memory for WORK.
+*
+*
+*     Begin Householder reconstruction routines
+*
+*     Factor the matrix A in the array AF.
+*
+      SRNAMT = 'ZLATSQR'
+      CALL ZLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, WORK, LWORK,
+     $              INFO )
+*
+*     Copy the factor R into the array R.
+*
+      SRNAMT = 'ZLACPY'
+      CALL ZLACPY( 'U', M, N, AF, M, R, M )
+*
+*     Reconstruct the orthogonal matrix Q.
+*
+      SRNAMT = 'ZUNGTSQR'
+      CALL ZUNGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORK, LWORK,
+     $               INFO )
+*
+*     Perform the Householder reconstruction; the result is stored
+*     in the arrays AF and T2.
+*
+      SRNAMT = 'ZUNHR_COL'
+      CALL ZUNHR_COL( M, N, NB2, AF, M, T2, NB2, DIAG, INFO )
+*
+*     Compute the factor R_hr corresponding to the Householder
+*     reconstructed Q_hr and place it in the upper triangle of AF to
+*     match the Q storage format in ZGEQRT. R_hr = S * R_tsqr,
+*     which means changing the sign of the I-th row of the matrix
+*     R_tsqr according to the sign of the I-th diagonal element
+*     DIAG(I) of the matrix S.
+*
+      SRNAMT = 'ZLACPY'
+      CALL ZLACPY( 'U', M, N, R, M, AF, M )
+*
+      DO I = 1, N
+         IF( DIAG( I ).EQ.-CONE ) THEN
+            CALL ZSCAL( N+1-I, -CONE, AF( I, I ), M )
+         END IF
+      END DO
+*
+*     End Householder reconstruction routines.
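+*
+*     (The loop above applies the diagonal sign matrix S implied by
+*     DIAG: whenever DIAG( I ) = -1, row I of R_tsqr is negated, so
+*     that the stored R_hr is the triangular factor that pairs with
+*     the reconstructed Q_hr.)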
+* +* +* Generate the m-by-m matrix Q +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL ZLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL ZLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**H)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = ZLANGE( '1', M, N, A, M, RWORK ) + RESID = ZLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**H)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL ZHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = ZLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = ZLANGE( '1', M, N, C, M, RWORK ) + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL ZGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**H)*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**H)*C| / ( eps * m * |C|) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL ZLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = ZLANGE( '1', N, M, D, N, RWORK ) + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**H)| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of ZUNHR_COL01 +* + END diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index a1d784fa5..87432fd04 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ 
b/lapack-netlib/TESTING/MATGEN/Makefile @@ -1,5 +1,3 @@ -include ../../make.inc - ####################################################################### # This is the makefile to create a library of the test matrix # generators used in LAPACK. The files are organized as follows: @@ -32,6 +30,9 @@ include ../../make.inc # ####################################################################### +TOPSRCDIR = ../.. +include $(TOPSRCDIR)/make.inc + SCATGEN = slatm1.o slatm7.o slaran.o slarnd.o SMATGEN = slatms.o slatme.o slatmr.o slatmt.o \ @@ -52,32 +53,32 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ zlagge.o zlaghe.o zlagsy.o zlakf2.o zlarge.o zlaror.o zlarot.o \ zlatm1.o zlarnd.o zlatm2.o zlatm3.o zlatm5.o zlatm6.o zlahilb.o -all: ../../$(TMGLIB) +.PHONY: all +all: $(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) -.PHONY: ../../$(TMGLIB) - -../../$(TMGLIB): $(ALLOBJ) - $(ARCH) $(ARCHFLAGS) $@ $^ +$(TMGLIB): $(ALLOBJ) + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ +.PHONY: single complex double complex16 single: $(SMATGEN) $(SCATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ - $(RANLIB) ../../$(TMGLIB) + $(AR) $(ARFLAGS) $(TMGLIB) $^ + $(RANLIB) $(TMGLIB) complex: $(CMATGEN) $(SCATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ - $(RANLIB) ../../$(TMGLIB) + $(AR) $(ARFLAGS) $(TMGLIB) $^ + $(RANLIB) $(TMGLIB) double: $(DMATGEN) $(DZATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ - $(RANLIB) ../../$(TMGLIB) + $(AR) $(ARFLAGS) $(TMGLIB) $^ + $(RANLIB) $(TMGLIB) complex16: $(ZMATGEN) $(DZATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ - $(RANLIB) ../../$(TMGLIB) + $(AR) $(ARFLAGS) $(TMGLIB) $^ + $(RANLIB) $(TMGLIB) $(SCATGEN): $(FRC) $(SMATGEN): $(FRC) @@ -89,14 +90,12 @@ $(ZMATGEN): $(FRC) FRC: @FRC=$(FRC) -clean: cleanobj #cleanlib +.PHONY: clean cleanobj cleanlib +clean: cleanobj cleanlib cleanobj: rm -f *.o cleanlib: - rm -f ../../$(TMGLIB) - -.f.o: - $(FORTRAN) $(OPTS) -c -o $@ $< + rm -f $(TMGLIB) -slaran.o: slaran.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< -dlaran.o: dlaran.f ; $(FORTRAN) $(NOOPT) -c -o $@ $< +slaran.o: slaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +dlaran.o: dlaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< diff --git a/lapack-netlib/TESTING/MATGEN/clahilb.f b/lapack-netlib/TESTING/MATGEN/clahilb.f index 13902872c..f4481fc78 100644 --- a/lapack-netlib/TESTING/MATGEN/clahilb.f +++ b/lapack-netlib/TESTING/MATGEN/clahilb.f @@ -164,7 +164,7 @@ INTEGER NMAX_EXACT, NMAX_APPROX, SIZE_D PARAMETER (NMAX_EXACT = 6, NMAX_APPROX = 11, SIZE_D = 8) * -* d's are generated from random permuation of those eight elements. +* d's are generated from random permutation of those eight elements. COMPLEX D1(8), D2(8), INVD1(8), INVD2(8) DATA D1 /(-1,0),(0,1),(-1,-1),(0,-1),(1,0),(-1,1),(1,1),(1,-1)/ DATA D2 /(-1,0),(0,-1),(-1,1),(0,1),(1,0),(-1,-1),(1,-1),(1,1)/ diff --git a/lapack-netlib/TESTING/MATGEN/clatm2.f b/lapack-netlib/TESTING/MATGEN/clatm2.f index 01221e0cc..5bd6b9dc8 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm2.f +++ b/lapack-netlib/TESTING/MATGEN/clatm2.f @@ -186,7 +186,7 @@ *> SPARSE is REAL *> Value between 0. and 1. *> On entry specifies the sparsity of the matrix -*> if sparse matix is to be generated. +*> if sparse matrix is to be generated. *> SPARSE should lie between 0 and 1. 
*> A uniform ( 0, 1 ) random number x is generated and *> compared to SPARSE; if x is larger the matrix entry diff --git a/lapack-netlib/TESTING/MATGEN/clatm3.f b/lapack-netlib/TESTING/MATGEN/clatm3.f index 3e07f3ec0..42b453553 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm3.f +++ b/lapack-netlib/TESTING/MATGEN/clatm3.f @@ -202,7 +202,7 @@ *> \verbatim *> SPARSE is REAL between 0. and 1. *> On entry specifies the sparsity of the matrix -*> if sparse matix is to be generated. +*> if sparse matrix is to be generated. *> SPARSE should lie between 0 and 1. *> A uniform ( 0, 1 ) random number x is generated and *> compared to SPARSE; if x is larger the matrix entry diff --git a/lapack-netlib/TESTING/MATGEN/clatmr.f b/lapack-netlib/TESTING/MATGEN/clatmr.f index 11d29a3d0..e80c4a514 100644 --- a/lapack-netlib/TESTING/MATGEN/clatmr.f +++ b/lapack-netlib/TESTING/MATGEN/clatmr.f @@ -316,20 +316,6 @@ *> Not referenced if PIVTNG = 'N'. Not modified. *> \endverbatim *> -*> \param[in] SPARSE -*> \verbatim -*> SPARSE is REAL -*> On entry specifies the sparsity of the matrix if a sparse -*> matrix is to be generated. SPARSE should lie between -*> 0 and 1. To generate a sparse matrix, for each matrix entry -*> a uniform ( 0, 1 ) random number x is generated and -*> compared to SPARSE; if x is larger the matrix entry -*> is unchanged and if x is smaller the entry is set -*> to zero. Thus on the average a fraction SPARSE of the -*> entries will be set to zero. -*> Not modified. -*> \endverbatim -*> *> \param[in] KL *> \verbatim *> KL is INTEGER @@ -350,6 +336,20 @@ *> Not modified. *> \endverbatim *> +*> \param[in] SPARSE +*> \verbatim +*> SPARSE is REAL +*> On entry specifies the sparsity of the matrix if a sparse +*> matrix is to be generated. SPARSE should lie between +*> 0 and 1. To generate a sparse matrix, for each matrix entry +*> a uniform ( 0, 1 ) random number x is generated and +*> compared to SPARSE; if x is larger the matrix entry +*> is unchanged and if x is smaller the entry is set +*> to zero. Thus on the average a fraction SPARSE of the +*> entries will be set to zero. +*> Not modified. +*> \endverbatim +*> *> \param[in] ANORM *> \verbatim *> ANORM is REAL @@ -416,7 +416,7 @@ *> If PACK='C' or 'R', LDA must be at least 1. *> If PACK='B', or 'Q', LDA must be MIN ( KU+1, N ) *> If PACK='Z', LDA must be at least KUU+KLL+1, where -*> KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, N-1 ) +*> KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, M-1 ) *> Not modified. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/MATGEN/dlatm2.f b/lapack-netlib/TESTING/MATGEN/dlatm2.f index 446f5a801..d7a6d19f3 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm2.f +++ b/lapack-netlib/TESTING/MATGEN/dlatm2.f @@ -182,7 +182,7 @@ *> \verbatim *> SPARSE is DOUBLE PRECISION between 0. and 1. *> On entry specifies the sparsity of the matrix -*> if sparse matix is to be generated. +*> if sparse matrix is to be generated. *> SPARSE should lie between 0 and 1. *> A uniform ( 0, 1 ) random number x is generated and *> compared to SPARSE; if x is larger the matrix entry diff --git a/lapack-netlib/TESTING/MATGEN/dlatm3.f b/lapack-netlib/TESTING/MATGEN/dlatm3.f index cf6da10f8..15f5ac080 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm3.f +++ b/lapack-netlib/TESTING/MATGEN/dlatm3.f @@ -199,7 +199,7 @@ *> \verbatim *> SPARSE is DOUBLE PRECISION between 0. and 1. *> On entry specifies the sparsity of the matrix -*> if sparse matix is to be generated. +*> if sparse matrix is to be generated. *> SPARSE should lie between 0 and 1. 
*> A uniform ( 0, 1 ) random number x is generated and *> compared to SPARSE; if x is larger the matrix entry diff --git a/lapack-netlib/TESTING/MATGEN/dlatmr.f b/lapack-netlib/TESTING/MATGEN/dlatmr.f index e7ea41907..a914481f7 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatmr.f +++ b/lapack-netlib/TESTING/MATGEN/dlatmr.f @@ -303,20 +303,6 @@ *> Not referenced if PIVTNG = 'N'. Not modified. *> \endverbatim *> -*> \param[in] SPARSE -*> \verbatim -*> SPARSE is DOUBLE PRECISION -*> On entry specifies the sparsity of the matrix if a sparse -*> matrix is to be generated. SPARSE should lie between -*> 0 and 1. To generate a sparse matrix, for each matrix entry -*> a uniform ( 0, 1 ) random number x is generated and -*> compared to SPARSE; if x is larger the matrix entry -*> is unchanged and if x is smaller the entry is set -*> to zero. Thus on the average a fraction SPARSE of the -*> entries will be set to zero. -*> Not modified. -*> \endverbatim -*> *> \param[in] KL *> \verbatim *> KL is INTEGER @@ -337,6 +323,20 @@ *> Not modified. *> \endverbatim *> +*> \param[in] SPARSE +*> \verbatim +*> SPARSE is DOUBLE PRECISION +*> On entry specifies the sparsity of the matrix if a sparse +*> matrix is to be generated. SPARSE should lie between +*> 0 and 1. To generate a sparse matrix, for each matrix entry +*> a uniform ( 0, 1 ) random number x is generated and +*> compared to SPARSE; if x is larger the matrix entry +*> is unchanged and if x is smaller the entry is set +*> to zero. Thus on the average a fraction SPARSE of the +*> entries will be set to zero. +*> Not modified. +*> \endverbatim +*> *> \param[in] ANORM *> \verbatim *> ANORM is DOUBLE PRECISION @@ -398,7 +398,7 @@ *> If PACK='C' or 'R', LDA must be at least 1. *> If PACK='B', or 'Q', LDA must be MIN ( KU+1, N ) *> If PACK='Z', LDA must be at least KUU+KLL+1, where -*> KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, N-1 ) +*> KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, M-1 ) *> Not modified. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/MATGEN/slatm2.f b/lapack-netlib/TESTING/MATGEN/slatm2.f index fc7e78126..2473f1f44 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm2.f +++ b/lapack-netlib/TESTING/MATGEN/slatm2.f @@ -182,7 +182,7 @@ *> \verbatim *> SPARSE is REAL between 0. and 1. *> On entry specifies the sparsity of the matrix -*> if sparse matix is to be generated. +*> if sparse matrix is to be generated. *> SPARSE should lie between 0 and 1. *> A uniform ( 0, 1 ) random number x is generated and *> compared to SPARSE; if x is larger the matrix entry diff --git a/lapack-netlib/TESTING/MATGEN/slatm3.f b/lapack-netlib/TESTING/MATGEN/slatm3.f index e61c954bd..18c2c07d5 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm3.f +++ b/lapack-netlib/TESTING/MATGEN/slatm3.f @@ -199,7 +199,7 @@ *> \verbatim *> SPARSE is REAL between 0. and 1. *> On entry specifies the sparsity of the matrix -*> if sparse matix is to be generated. +*> if sparse matrix is to be generated. *> SPARSE should lie between 0 and 1. *> A uniform ( 0, 1 ) random number x is generated and *> compared to SPARSE; if x is larger the matrix entry diff --git a/lapack-netlib/TESTING/MATGEN/slatmr.f b/lapack-netlib/TESTING/MATGEN/slatmr.f index e4705994a..c2cedd21c 100644 --- a/lapack-netlib/TESTING/MATGEN/slatmr.f +++ b/lapack-netlib/TESTING/MATGEN/slatmr.f @@ -303,20 +303,6 @@ *> Not referenced if PIVTNG = 'N'. Not modified. *> \endverbatim *> -*> \param[in] SPARSE -*> \verbatim -*> SPARSE is REAL -*> On entry specifies the sparsity of the matrix if a sparse -*> matrix is to be generated. 
SPARSE should lie between -*> 0 and 1. To generate a sparse matrix, for each matrix entry -*> a uniform ( 0, 1 ) random number x is generated and -*> compared to SPARSE; if x is larger the matrix entry -*> is unchanged and if x is smaller the entry is set -*> to zero. Thus on the average a fraction SPARSE of the -*> entries will be set to zero. -*> Not modified. -*> \endverbatim -*> *> \param[in] KL *> \verbatim *> KL is INTEGER @@ -337,6 +323,20 @@ *> Not modified. *> \endverbatim *> +*> \param[in] SPARSE +*> \verbatim +*> SPARSE is REAL +*> On entry specifies the sparsity of the matrix if a sparse +*> matrix is to be generated. SPARSE should lie between +*> 0 and 1. To generate a sparse matrix, for each matrix entry +*> a uniform ( 0, 1 ) random number x is generated and +*> compared to SPARSE; if x is larger the matrix entry +*> is unchanged and if x is smaller the entry is set +*> to zero. Thus on the average a fraction SPARSE of the +*> entries will be set to zero. +*> Not modified. +*> \endverbatim +*> *> \param[in] ANORM *> \verbatim *> ANORM is REAL @@ -398,7 +398,7 @@ *> If PACK='C' or 'R', LDA must be at least 1. *> If PACK='B', or 'Q', LDA must be MIN ( KU+1, N ) *> If PACK='Z', LDA must be at least KUU+KLL+1, where -*> KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, N-1 ) +*> KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, M-1 ) *> Not modified. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/MATGEN/zlahilb.f b/lapack-netlib/TESTING/MATGEN/zlahilb.f index 43057931d..e5a317821 100644 --- a/lapack-netlib/TESTING/MATGEN/zlahilb.f +++ b/lapack-netlib/TESTING/MATGEN/zlahilb.f @@ -164,7 +164,7 @@ INTEGER NMAX_EXACT, NMAX_APPROX, SIZE_D PARAMETER (NMAX_EXACT = 6, NMAX_APPROX = 11, SIZE_D = 8) * -* d's are generated from random permuation of those eight elements. +* d's are generated from random permutation of those eight elements. COMPLEX*16 d1(8), d2(8), invd1(8), invd2(8) DATA D1 /(-1,0),(0,1),(-1,-1),(0,-1),(1,0),(-1,1),(1,1),(1,-1)/ DATA D2 /(-1,0),(0,-1),(-1,1),(0,1),(1,0),(-1,-1),(1,-1),(1,1)/ diff --git a/lapack-netlib/TESTING/MATGEN/zlatm2.f b/lapack-netlib/TESTING/MATGEN/zlatm2.f index 2de69eeca..ea93431e7 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatm2.f +++ b/lapack-netlib/TESTING/MATGEN/zlatm2.f @@ -185,7 +185,7 @@ *> \verbatim *> SPARSE is DOUBLE PRECISION between 0. and 1. *> On entry specifies the sparsity of the matrix -*> if sparse matix is to be generated. +*> if sparse matrix is to be generated. *> SPARSE should lie between 0 and 1. *> A uniform ( 0, 1 ) random number x is generated and *> compared to SPARSE; if x is larger the matrix entry diff --git a/lapack-netlib/TESTING/MATGEN/zlatm3.f b/lapack-netlib/TESTING/MATGEN/zlatm3.f index 42d58c853..25d6233f3 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatm3.f +++ b/lapack-netlib/TESTING/MATGEN/zlatm3.f @@ -202,7 +202,7 @@ *> \verbatim *> SPARSE is DOUBLE PRECISION between 0. and 1. *> On entry specifies the sparsity of the matrix -*> if sparse matix is to be generated. +*> if sparse matrix is to be generated. *> SPARSE should lie between 0 and 1. *> A uniform ( 0, 1 ) random number x is generated and *> compared to SPARSE; if x is larger the matrix entry diff --git a/lapack-netlib/TESTING/MATGEN/zlatmr.f b/lapack-netlib/TESTING/MATGEN/zlatmr.f index 6685a3570..56285e1f4 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatmr.f +++ b/lapack-netlib/TESTING/MATGEN/zlatmr.f @@ -316,20 +316,6 @@ *> Not referenced if PIVTNG = 'N'. Not modified. 
*> \endverbatim *> -*> \param[in] SPARSE -*> \verbatim -*> SPARSE is DOUBLE PRECISION -*> On entry specifies the sparsity of the matrix if a sparse -*> matrix is to be generated. SPARSE should lie between -*> 0 and 1. To generate a sparse matrix, for each matrix entry -*> a uniform ( 0, 1 ) random number x is generated and -*> compared to SPARSE; if x is larger the matrix entry -*> is unchanged and if x is smaller the entry is set -*> to zero. Thus on the average a fraction SPARSE of the -*> entries will be set to zero. -*> Not modified. -*> \endverbatim -*> *> \param[in] KL *> \verbatim *> KL is INTEGER @@ -350,6 +336,20 @@ *> Not modified. *> \endverbatim *> +*> \param[in] SPARSE +*> \verbatim +*> SPARSE is DOUBLE PRECISION +*> On entry specifies the sparsity of the matrix if a sparse +*> matrix is to be generated. SPARSE should lie between +*> 0 and 1. To generate a sparse matrix, for each matrix entry +*> a uniform ( 0, 1 ) random number x is generated and +*> compared to SPARSE; if x is larger the matrix entry +*> is unchanged and if x is smaller the entry is set +*> to zero. Thus on the average a fraction SPARSE of the +*> entries will be set to zero. +*> Not modified. +*> \endverbatim +*> *> \param[in] ANORM *> \verbatim *> ANORM is DOUBLE PRECISION @@ -416,7 +416,7 @@ *> If PACK='C' or 'R', LDA must be at least 1. *> If PACK='B', or 'Q', LDA must be MIN ( KU+1, N ) *> If PACK='Z', LDA must be at least KUU+KLL+1, where -*> KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, N-1 ) +*> KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, M-1 ) *> Not modified. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/Makefile b/lapack-netlib/TESTING/Makefile index 8b883c0fa..bdea2bfaa 100644 --- a/lapack-netlib/TESTING/Makefile +++ b/lapack-netlib/TESTING/Makefile @@ -34,8 +34,10 @@ # ####################################################################### -include ../make.inc +TOPSRCDIR = .. 
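+# (TOPSRCDIR names the top of the source tree explicitly, so the same
+# make.inc include works from any subdirectory depth; the MATGEN
+# Makefile above uses TOPSRCDIR = ../.. for the same reason.)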
+include $(TOPSRCDIR)/make.inc +.PHONY: all all: single complex double complex16 singleproto doubleproto complexproto complex16proto SEIGTST= snep.out \ @@ -139,10 +141,13 @@ ZLINTST= ztest.out ZLINTSTPROTO= zctest.out ztest_rfp.out +.PHONY: single complex double complex16 single: $(SLINTST) $(SEIGTST) complex: $(CLINTST) $(CEIGTST) double: $(DLINTST) $(DEIGTST) complex16: $(ZLINTST) $(ZEIGTST) + +.PHONY: singleproto complexproto doubleproto complex16proto singleproto: $(SLINTSTPROTO) complexproto: $(CLINTSTPROTO) doubleproto: $(DLINTSTPROTO) @@ -153,61 +158,61 @@ complex16proto: $(ZLINTSTPROTO) stest.out: stest.in LIN/xlintsts @echo Testing REAL LAPACK linear equation routines - ./LIN/xlintsts < $< > $@ 2>&1 + ./LIN/xlintsts < stest.in > $@ 2>&1 # # ======== COMPLEX LIN TESTS ========================== ctest.out: ctest.in LIN/xlintstc @echo Testing COMPLEX LAPACK linear equation routines - ./LIN/xlintstc < $< > $@ 2>&1 + ./LIN/xlintstc < ctest.in > $@ 2>&1 # # ======== DOUBLE LIN TESTS =========================== dtest.out: dtest.in LIN/xlintstd @echo Testing DOUBLE PRECISION LAPACK linear equation routines - ./LIN/xlintstd < $< > $@ 2>&1 + ./LIN/xlintstd < dtest.in > $@ 2>&1 # # ======== COMPLEX16 LIN TESTS ======================== ztest.out: ztest.in LIN/xlintstz @echo Testing COMPLEX16 LAPACK linear equation routines - ./LIN/xlintstz < $< > $@ 2>&1 + ./LIN/xlintstz < ztest.in > $@ 2>&1 # # ======== SINGLE-DOUBLE PROTO LIN TESTS ============== dstest.out: dstest.in LIN/xlintstds @echo Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines - ./LIN/xlintstds < $< > $@ 2>&1 + ./LIN/xlintstds < dstest.in > $@ 2>&1 # # ======== COMPLEX-COMPLEX16 LIN TESTS ======================== zctest.out: zctest.in LIN/xlintstzc @echo Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines - ./LIN/xlintstzc < $< > $@ 2>&1 + ./LIN/xlintstzc < zctest.in > $@ 2>&1 # # ======== SINGLE RFP LIN TESTS ======================== stest_rfp.out: stest_rfp.in LIN/xlintstrfs @echo Testing REAL LAPACK RFP prototype linear equation routines - ./LIN/xlintstrfs < $< > $@ 2>&1 + ./LIN/xlintstrfs < stest_rfp.in > $@ 2>&1 # # ======== COMPLEX16 RFP LIN TESTS ======================== dtest_rfp.out: dtest_rfp.in LIN/xlintstrfd @echo Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines - ./LIN/xlintstrfd < $< > $@ 2>&1 + ./LIN/xlintstrfd < dtest_rfp.in > $@ 2>&1 # # ======== COMPLEX16 RFP LIN TESTS ======================== ctest_rfp.out: ctest_rfp.in LIN/xlintstrfc @echo Testing COMPLEX LAPACK RFP prototype linear equation routines - ./LIN/xlintstrfc < $< > $@ 2>&1 + ./LIN/xlintstrfc < ctest_rfp.in > $@ 2>&1 # # ======== COMPLEX16 RFP LIN TESTS ======================== ztest_rfp.out: ztest_rfp.in LIN/xlintstrfz @echo Testing COMPLEX16 LAPACK RFP prototype linear equation routines - ./LIN/xlintstrfz < $< > $@ 2>&1 + ./LIN/xlintstrfz < ztest_rfp.in > $@ 2>&1 # # # ======== SINGLE EIG TESTS =========================== @@ -215,329 +220,329 @@ ztest_rfp.out: ztest_rfp.in LIN/xlintstrfz snep.out: nep.in EIG/xeigtsts @echo NEP: Testing Nonsymmetric Eigenvalue Problem routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < nep.in > $@ 2>&1 ssep.out: sep.in EIG/xeigtsts @echo SEP: Testing Symmetric Eigenvalue Problem routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sep.in > $@ 2>&1 sse2.out: se2.in EIG/xeigtsts @echo SEP: Testing Symmetric Eigenvalue Problem routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < se2.in > $@ 2>&1 ssvd.out: svd.in EIG/xeigtsts @echo SVD: 
Testing Singular Value Decomposition routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < svd.in > $@ 2>&1 sec.out: sec.in EIG/xeigtsts @echo SEC: Testing REAL Eigen Condition Routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sec.in > $@ 2>&1 sed.out: sed.in EIG/xeigtsts @echo SEV: Testing REAL Nonsymmetric Eigenvalue Driver - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sed.in > $@ 2>&1 sgg.out: sgg.in EIG/xeigtsts @echo SGG: Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sgg.in > $@ 2>&1 sgd.out: sgd.in EIG/xeigtsts @echo SGD: Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sgd.in > $@ 2>&1 ssb.out: ssb.in EIG/xeigtsts @echo SSB: Testing REAL Symmetric Eigenvalue Problem routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < ssb.in > $@ 2>&1 ssg.out: ssg.in EIG/xeigtsts @echo SSG: Testing REAL Symmetric Generalized Eigenvalue Problem routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < ssg.in > $@ 2>&1 sbal.out: sbal.in EIG/xeigtsts @echo SGEBAL: Testing the balancing of a REAL general matrix - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sbal.in > $@ 2>&1 sbak.out: sbak.in EIG/xeigtsts @echo SGEBAK: Testing the back transformation of a REAL balanced matrix - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sbak.in > $@ 2>&1 sgbal.out: sgbal.in EIG/xeigtsts @echo SGGBAL: Testing the balancing of a pair of REAL general matrices - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sgbal.in > $@ 2>&1 sgbak.out: sgbak.in EIG/xeigtsts @echo SGGBAK: Testing the back transformation of a pair of REAL balanced matrices - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sgbak.in > $@ 2>&1 sbb.out: sbb.in EIG/xeigtsts @echo SBB: Testing banded Singular Value Decomposition routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < sbb.in > $@ 2>&1 sglm.out: glm.in EIG/xeigtsts @echo GLM: Testing Generalized Linear Regression Model routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < glm.in > $@ 2>&1 sgqr.out: gqr.in EIG/xeigtsts @echo GQR: Testing Generalized QR and RQ factorization routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < gqr.in > $@ 2>&1 sgsv.out: gsv.in EIG/xeigtsts @echo GSV: Testing Generalized Singular Value Decomposition routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < gsv.in > $@ 2>&1 scsd.out: csd.in EIG/xeigtsts @echo CSD: Testing CS Decomposition routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < csd.in > $@ 2>&1 slse.out: lse.in EIG/xeigtsts @echo LSE: Testing Constrained Linear Least Squares routines - ./EIG/xeigtsts < $< > $@ 2>&1 + ./EIG/xeigtsts < lse.in > $@ 2>&1 # # ======== COMPLEX EIG TESTS =========================== cnep.out: nep.in EIG/xeigtstc @echo NEP: Testing Nonsymmetric Eigenvalue Problem routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < nep.in > $@ 2>&1 csep.out: sep.in EIG/xeigtstc @echo SEP: Testing Symmetric Eigenvalue Problem routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < sep.in > $@ 2>&1 cse2.out: se2.in EIG/xeigtstc @echo SEP: Testing Symmetric Eigenvalue Problem routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < se2.in > $@ 2>&1 csvd.out: svd.in EIG/xeigtstc @echo SVD: Testing Singular Value Decomposition routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < svd.in > $@ 2>&1 cec.out: cec.in EIG/xeigtstc @echo CEC: Testing COMPLEX Eigen Condition Routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc 
< cec.in > $@ 2>&1 ced.out: ced.in EIG/xeigtstc @echo CES: Testing COMPLEX Nonsymmetric Schur Form Driver - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < ced.in > $@ 2>&1 cgg.out: cgg.in EIG/xeigtstc @echo CGG: Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < cgg.in > $@ 2>&1 cgd.out: cgd.in EIG/xeigtstc @echo CGD: Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < cgd.in > $@ 2>&1 csb.out: csb.in EIG/xeigtstc @echo CHB: Testing Hermitian Eigenvalue Problem routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < csb.in > $@ 2>&1 csg.out: csg.in EIG/xeigtstc @echo CSG: Testing Symmetric Generalized Eigenvalue Problem routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < csg.in > $@ 2>&1 cbal.out: cbal.in EIG/xeigtstc @echo CGEBAL: Testing the balancing of a COMPLEX general matrix - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < cbal.in > $@ 2>&1 cbak.out: cbak.in EIG/xeigtstc @echo CGEBAK: Testing the back transformation of a COMPLEX balanced matrix - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < cbak.in > $@ 2>&1 cgbal.out: cgbal.in EIG/xeigtstc @echo CGGBAL: Testing the balancing of a pair of COMPLEX general matrices - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < cgbal.in > $@ 2>&1 cgbak.out: cgbak.in EIG/xeigtstc @echo CGGBAK: Testing the back transformation of a pair of COMPLEX balanced matrices - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < cgbak.in > $@ 2>&1 cbb.out: cbb.in EIG/xeigtstc @echo CBB: Testing banded Singular Value Decomposition routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < cbb.in > $@ 2>&1 cglm.out: glm.in EIG/xeigtstc @echo GLM: Testing Generalized Linear Regression Model routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < glm.in > $@ 2>&1 cgqr.out: gqr.in EIG/xeigtstc @echo GQR: Testing Generalized QR and RQ factorization routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < gqr.in > $@ 2>&1 cgsv.out: gsv.in EIG/xeigtstc @echo GSV: Testing Generalized Singular Value Decomposition routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < gsv.in > $@ 2>&1 ccsd.out: csd.in EIG/xeigtstc @echo CSD: Testing CS Decomposition routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < csd.in > $@ 2>&1 clse.out: lse.in EIG/xeigtstc @echo LSE: Testing Constrained Linear Least Squares routines - ./EIG/xeigtstc < $< > $@ 2>&1 + ./EIG/xeigtstc < lse.in > $@ 2>&1 # # ======== DOUBLE EIG TESTS =========================== dnep.out: nep.in EIG/xeigtstd @echo NEP: Testing Nonsymmetric Eigenvalue Problem routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < nep.in > $@ 2>&1 dsep.out: sep.in EIG/xeigtstd @echo SEP: Testing Symmetric Eigenvalue Problem routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < sep.in > $@ 2>&1 dse2.out: se2.in EIG/xeigtstd @echo SEP: Testing Symmetric Eigenvalue Problem routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < se2.in > $@ 2>&1 dsvd.out: svd.in EIG/xeigtstd @echo SVD: Testing Singular Value Decomposition routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < svd.in > $@ 2>&1 dec.out: dec.in EIG/xeigtstd @echo DEC: Testing DOUBLE PRECISION Eigen Condition Routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dec.in > $@ 2>&1 ded.out: ded.in EIG/xeigtstd @echo DEV: Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < ded.in > $@ 2>&1 dgg.out: dgg.in EIG/xeigtstd @echo 
DGG: Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dgg.in > $@ 2>&1 dgd.out: dgd.in EIG/xeigtstd @echo DGD: Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dgd.in > $@ 2>&1 dsb.out: dsb.in EIG/xeigtstd @echo DSB: Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dsb.in > $@ 2>&1 dsg.out: dsg.in EIG/xeigtstd @echo DSG: Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dsg.in > $@ 2>&1 dbal.out: dbal.in EIG/xeigtstd @echo DGEBAL: Testing the balancing of a DOUBLE PRECISION general matrix - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dbal.in > $@ 2>&1 dbak.out: dbak.in EIG/xeigtstd @echo DGEBAK: Testing the back transformation of a DOUBLE PRECISION balanced matrix - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dbak.in > $@ 2>&1 dgbal.out: dgbal.in EIG/xeigtstd @echo DGGBAL: Testing the balancing of a pair of DOUBLE PRECISION general matrices - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dgbal.in > $@ 2>&1 dgbak.out: dgbak.in EIG/xeigtstd @echo DGGBAK: Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dgbak.in > $@ 2>&1 dbb.out: dbb.in EIG/xeigtstd @echo DBB: Testing banded Singular Value Decomposition routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < dbb.in > $@ 2>&1 dglm.out: glm.in EIG/xeigtstd @echo GLM: Testing Generalized Linear Regression Model routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < glm.in > $@ 2>&1 dgqr.out: gqr.in EIG/xeigtstd @echo GQR: Testing Generalized QR and RQ factorization routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < gqr.in > $@ 2>&1 dgsv.out: gsv.in EIG/xeigtstd @echo GSV: Testing Generalized Singular Value Decomposition routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < gsv.in > $@ 2>&1 dcsd.out: csd.in EIG/xeigtstd @echo CSD: Testing CS Decomposition routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < csd.in > $@ 2>&1 dlse.out: lse.in EIG/xeigtstd @echo LSE: Testing Constrained Linear Least Squares routines - ./EIG/xeigtstd < $< > $@ 2>&1 + ./EIG/xeigtstd < lse.in > $@ 2>&1 # # ======== COMPLEX16 EIG TESTS =========================== znep.out: nep.in EIG/xeigtstz @echo NEP: Testing Nonsymmetric Eigenvalue Problem routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < nep.in > $@ 2>&1 zsep.out: sep.in EIG/xeigtstz @echo SEP: Testing Symmetric Eigenvalue Problem routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < sep.in > $@ 2>&1 zse2.out: se2.in EIG/xeigtstz @echo SEP: Testing Symmetric Eigenvalue Problem routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < se2.in > $@ 2>&1 zsvd.out: svd.in EIG/xeigtstz @echo SVD: Testing Singular Value Decomposition routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < svd.in > $@ 2>&1 zec.out: zec.in EIG/xeigtstz @echo ZEC: Testing COMPLEX16 Eigen Condition Routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zec.in > $@ 2>&1 zed.out: zed.in EIG/xeigtstz @echo ZES: Testing COMPLEX16 Nonsymmetric Schur Form Driver - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zed.in > $@ 2>&1 zgg.out: zgg.in EIG/xeigtstz @echo ZGG: Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zgg.in > 
$@ 2>&1 zgd.out: zgd.in EIG/xeigtstz @echo ZGD: Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zgd.in > $@ 2>&1 zsb.out: zsb.in EIG/xeigtstz @echo ZHB: Testing Hermitian Eigenvalue Problem routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zsb.in > $@ 2>&1 zsg.out: zsg.in EIG/xeigtstz @echo ZSG: Testing Symmetric Generalized Eigenvalue Problem routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zsg.in > $@ 2>&1 zbal.out: zbal.in EIG/xeigtstz @echo ZGEBAL: Testing the balancing of a COMPLEX16 general matrix - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zbal.in > $@ 2>&1 zbak.out: zbak.in EIG/xeigtstz @echo ZGEBAK: Testing the back transformation of a COMPLEX16 balanced matrix - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zbak.in > $@ 2>&1 zgbal.out: zgbal.in EIG/xeigtstz @echo ZGGBAL: Testing the balancing of a pair of COMPLEX general matrices - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zgbal.in > $@ 2>&1 zgbak.out: zgbak.in EIG/xeigtstz @echo ZGGBAK: Testing the back transformation of a pair of COMPLEX16 balanced matrices - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zgbak.in > $@ 2>&1 zbb.out: zbb.in EIG/xeigtstz @echo ZBB: Testing banded Singular Value Decomposition routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < zbb.in > $@ 2>&1 zglm.out: glm.in EIG/xeigtstz @echo GLM: Testing Generalized Linear Regression Model routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < glm.in > $@ 2>&1 zgqr.out: gqr.in EIG/xeigtstz @echo GQR: Testing Generalized QR and RQ factorization routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < gqr.in > $@ 2>&1 zgsv.out: gsv.in EIG/xeigtstz @echo GSV: Testing Generalized Singular Value Decomposition routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < gsv.in > $@ 2>&1 zcsd.out: csd.in EIG/xeigtstz @echo CSD: Testing CS Decomposition routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < csd.in > $@ 2>&1 zlse.out: lse.in EIG/xeigtstz @echo LSE: Testing Constrained Linear Least Squares routines - ./EIG/xeigtstz < $< > $@ 2>&1 + ./EIG/xeigtstz < lse.in > $@ 2>&1 # ============================================================================== LIN/xlintsts: $(FRCLIN) $(FRC) @@ -582,6 +587,7 @@ EIG/xeigtstd: $(FRCEIG) $(FRC) EIG/xeigtstz: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtstz +.PHONY: clean cleantest clean: cleantest cleantest: rm -f *.out core diff --git a/lapack-netlib/TESTING/ctest.in b/lapack-netlib/TESTING/ctest.in index 2f3853a03..a3588b4a1 100644 --- a/lapack-netlib/TESTING/ctest.in +++ b/lapack-netlib/TESTING/ctest.in @@ -50,3 +50,4 @@ CQX CXQ CTQ CTS +CHH diff --git a/lapack-netlib/TESTING/dtest.in b/lapack-netlib/TESTING/dtest.in index a7a16ee41..29bb8b92e 100644 --- a/lapack-netlib/TESTING/dtest.in +++ b/lapack-netlib/TESTING/dtest.in @@ -44,3 +44,4 @@ DQX DXQ DTQ DTS +DHH diff --git a/lapack-netlib/TESTING/stest.in b/lapack-netlib/TESTING/stest.in index d32047047..27ac30040 100644 --- a/lapack-netlib/TESTING/stest.in +++ b/lapack-netlib/TESTING/stest.in @@ -44,3 +44,4 @@ SQX SXQ STQ STS +SHH diff --git a/lapack-netlib/TESTING/ztest.in b/lapack-netlib/TESTING/ztest.in index 520253941..58da33d60 100644 --- a/lapack-netlib/TESTING/ztest.in +++ b/lapack-netlib/TESTING/ztest.in @@ -50,3 +50,4 @@ ZQX ZXQ ZTQ ZTS +ZHH diff --git a/lapack-netlib/appveyor.yml b/lapack-netlib/appveyor.yml deleted file mode 100644 index 7fc3fbdd7..000000000 --- a/lapack-netlib/appveyor.yml +++ /dev/null @@ -1,64 +0,0 @@ -# 
Windows testing. -# Syntax for this file: -# http://www.appveyor.com/docs/appveyor-yml - -shallow_clone: true - -platform: x64 - -cache: - - x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z - - i686-4.9.2-release-win32-dwarf-rt_v4-rev4.7z - -environment: - CTEST_OUTPUT_ON_FAILURE: 1 - matrix: - - MINGW_DIR: mingw64 - MINGW_URL: https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win64/Personal%20Builds/mingw-builds/4.9.2/threads-win32/seh/x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z/download - MINGW_ARCHIVE: x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z - - MINGW_DIR: mingw32 - MINGW_URL: https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/4.9.2/threads-win32/dwarf/i686-4.9.2-release-win32-dwarf-rt_v4-rev4.7z/download - MINGW_ARCHIVE: i686-4.9.2-release-win32-dwarf-rt_v4-rev4.7z - -install: - - if not exist "%MINGW_ARCHIVE%" appveyor DownloadFile "%MINGW_URL%" -FileName "%MINGW_ARCHIVE%" - - 7z x -y "%MINGW_ARCHIVE%" > nul - # CMake refuses to generate MinGW Makefiles if sh.exe is in the Path - - ps: Get-Command sh.exe -All | Remove-Item - -build_script: - - echo "NUMBER_OF_PROCESSORS=%NUMBER_OF_PROCESSORS%" - - set PATH=%CD%\%MINGW_DIR%\bin;%PATH% - - g++ --version - - mingw32-make --version - - cmake --version - - if "%APPVEYOR_REPO_TAG%"=="true" (set CMAKE_BUILD_TYPE=Release) else (set CMAKE_BUILD_TYPE=Debug) - - set SRC_DIR=%CD% - - echo %SRC_DIR% - - set BLD_DIR=%SRC_DIR%\..\lapack-appveyor-bld - - set INST_DIR=%SRC_DIR%\..\lapack-appveyor-install - - mkdir -p %BLD_DIR% - - cd %BLD_DIR% - # See issue #17 on github dashboard. Once resolved, use -DCBLAS=ON - # - cmake -DCMAKE_INSTALL_PREFIX=${INST_DIR} -DLAPACKE=ON ${SRC_DIR} - - cmake - -G "MinGW Makefiles" - -DBUILDNAME:STRING="appveyor-%MINGW_DIR%-%APPVEYOR_REPO_BRANCH%" - -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% - -DCMAKE_INSTALL_PREFIX=%INST_DIR% - -DCBLAS:BOOL=ON - -DLAPACKE:BOOL=ON - -DBUILD_TESTING=ON - -DLAPACKE_WITH_TMG:BOOL=ON - %SRC_DIR% - - mingw32-make -j%NUMBER_OF_PROCESSORS% - -test_script: - - ctest -D ExperimentalStart - - ctest -D ExperimentalConfigure - - ctest -D ExperimentalBuild -j%NUMBER_OF_PROCESSORS% - - ctest -D ExperimentalTest --schedule-random -j%NUMBER_OF_PROCESSORS% --output-on-failure --timeout 100 -E "CBLAS\-.*cblat1" - - ctest -D ExperimentalSubmit - -after_test: - - mingw32-make install -j%NUMBER_OF_PROCESSORS% diff --git a/lapack-netlib/lapack_build.cmake b/lapack-netlib/lapack_build.cmake index 68744cc4c..39878cb24 100644 --- a/lapack-netlib/lapack_build.cmake +++ b/lapack-netlib/lapack_build.cmake @@ -69,7 +69,8 @@ find_program(HOSTNAME NAMES hostname) find_program(UNAME NAMES uname) # Get the build name and hostname -exec_program(${HOSTNAME} ARGS OUTPUT_VARIABLE hostname) +execute_process(${HOSTNAME} + OUTPUT_VARIABLE hostname) string(REGEX REPLACE "[/\\\\+<> #]" "-" hostname "${hostname}") message("HOSTNAME: ${hostname}") @@ -83,7 +84,8 @@ find_package(Git REQUIRED) set(CTEST_GIT_COMMAND ${GIT_EXECUTABLE}) set(CTEST_UPDATE_COMMAND ${GIT_EXECUTABLE}) macro(getuname name flag) - exec_program("${UNAME}" ARGS "${flag}" OUTPUT_VARIABLE "${name}") + execute_process(COMMAND "${UNAME}" "${flag}" + OUTPUT_VARIABLE "${name}") string(REGEX REPLACE "[/\\\\+<> #]" "-" "${name}" "${${name}}") string(REGEX REPLACE "^(......|.....|....|...|..|.).*" "\\1" "${name}" "${${name}}") endmacro() @@ -167,7 +169,7 @@ endif() # dashboard then set this variable to the directory # the dashboard should be in make_directory("${CTEST_DASHBOARD_ROOT}") 
-# these are the the name of the source and binary directory on disk. +# these are the names of the source and binary directory on disk. # They will be appended to DASHBOARD_ROOT set(CTEST_SOURCE_DIRECTORY "${CTEST_DASHBOARD_ROOT}/${CTEST_DIR_NAME}") set(CTEST_BINARY_DIRECTORY "${CTEST_SOURCE_DIRECTORY}-${CTEST_BUILD_NAME}") diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 5d07e1e87..5582744a0 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -12,8 +12,8 @@ import os, sys, math import getopt # Arguments try: - opts, args = getopt.getopt(sys.argv[1:], "hd:srep:t:n", - ["help", "dir", "short", "run", "error","prec=","test=","number"]) + opts, args = getopt.getopt(sys.argv[1:], "hd:b:srep:t:n", + ["help", "dir", "bin", "short", "run", "error","prec=","test=","number"]) except getopt.error as msg: print(msg) @@ -29,14 +29,13 @@ only_numbers=0 test_dir='TESTING' bin_dir='bin/Release' -abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir)) - for o, a in opts: if o in ("-h", "--help"): print(sys.argv[0]+" [-h|--help] [-d dir |--dir dir] [-s |--short] [-r |--run] [-e |--error] [-p p |--prec p] [-t test |--test test] [-n | --number]") print(" - h is to print this message") print(" - r is to use to run the LAPACK tests then analyse the output (.out files). By default, the script will not run all the LAPACK tests") print(" - d [dir] is to indicate where is the LAPACK testing directory (.out files). By default, the script will use .") + print(" - b [bin] is to indicate where is the LAPACK binary files are located. By default, the script will use .") print(" LEVEL OF OUTPUT") print(" - x is to print a detailed summary") print(" - e is to print only the error summary") @@ -75,6 +74,8 @@ for o, a in opts: just_errors = 1 if o in ( '-p', '--prec' ): prec = a + if o in ( '-b', '--bin' ): + bin_dir = a if o in ( '-d', '--dir' ): test_dir = a if o in ( '-t', '--test' ): @@ -85,6 +86,8 @@ for o, a in opts: # process options +abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir)) + os.chdir(test_dir) execution=1 @@ -114,10 +117,7 @@ def run_summary_test( f, cmdline, short_summary): pipe = open(cmdline,'r') r=0 else: - if os.name != 'nt': - cmdline='./' + cmdline - else : - cmdline=abs_bin_dir+os.path.sep+cmdline + cmdline = os.path.join(abs_bin_dir, cmdline) outfile=cmdline.split()[4] #pipe = open(outfile,'w') diff --git a/lapack-netlib/make.inc.example b/lapack-netlib/make.inc.example index d780c3a23..57fd51ebe 100644 --- a/lapack-netlib/make.inc.example +++ b/lapack-netlib/make.inc.example @@ -8,10 +8,10 @@ SHELL = /bin/sh # CC is the C compiler, normally invoked with options CFLAGS. # -CC = gcc +CC = gcc CFLAGS = -O3 -# Modify the FORTRAN and OPTS definitions to refer to the compiler +# Modify the FC and FFLAGS definitions to the desired compiler # and desired compiler options for your machine. NOOPT refers to # the compiler options desired when NO OPTIMIZATION is selected. # @@ -19,23 +19,21 @@ CFLAGS = -O3 # and handle these quantities appropriately. As a consequence, one # should not compile LAPACK with flags such as -ffpe-trap=overflow. # -FORTRAN = gfortran -OPTS = -O2 -frecursive -DRVOPTS = $(OPTS) -NOOPT = -O0 -frecursive +FC = gfortran +FFLAGS = -O2 -frecursive +FFLAGS_DRV = $(FFLAGS) +FFLAGS_NOOPT = -O0 -frecursive -# Define LOADER and LOADOPTS to refer to the loader and desired -# load options for your machine. +# Define LDFLAGS to the desired linker options for your machine. 
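+# (LDFLAGS replaces the LOADER / LOADOPTS pair that earlier versions
+# of this file used to configure the link step.)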
# -LOADER = gfortran -LOADOPTS = +LDFLAGS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib +AR = ar +ARFLAGS = cr +RANLIB = ranlib # Timer for the SECOND and DSECND routines # @@ -78,8 +76,8 @@ TIMER = INT_ETIME # machine-specific, optimized BLAS library should be used whenever # possible.) # -BLASLIB = ../../librefblas.a -CBLASLIB = ../../libcblas.a -LAPACKLIB = liblapack.a -TMGLIB = libtmglib.a -LAPACKELIB = liblapacke.a +BLASLIB = $(TOPSRCDIR)/librefblas.a +CBLASLIB = $(TOPSRCDIR)/libcblas.a +LAPACKLIB = $(TOPSRCDIR)/liblapack.a +TMGLIB = $(TOPSRCDIR)/libtmglib.a +LAPACKELIB = $(TOPSRCDIR)/liblapacke.a diff --git a/lapack-netlib/meson.build b/lapack-netlib/meson.build new file mode 100644 index 000000000..b1e9c6bc1 --- /dev/null +++ b/lapack-netlib/meson.build @@ -0,0 +1,28 @@ +# cd build +# meson --buildtype release --prefix=$HOME/.local/lapack .. +# ninja +# ninja install + +project('LAPACK', 'fortran', + default_options : ['default_library=static', 'libdir=lib/'], + version : '3.8.0') + +subdir('BLAS/SRC') +subdir('SRC') + +prec = get_option('realkind') + + +if prec == 'd' + bsrc = DBLAS1 + DBLAS2 + DBLAS3 + lsrc = DZLAUX + DSLASRC +elif prec == 's' + bsrc = SBLAS1 + SBLAS2 + SBLAS3 + lsrc = SCLAUX + SLASRC +endif + +blas = library('blas', bsrc, + install : true) + +lapack = library('lapack', lsrc, ALLAUX, + install : true) diff --git a/lapack-netlib/meson_options.txt b/lapack-netlib/meson_options.txt new file mode 100644 index 000000000..b378e3329 --- /dev/null +++ b/lapack-netlib/meson_options.txt @@ -0,0 +1,3 @@ +option('realkind', type : 'string', value : 'd', + description : 's: real32 d: real64 c: complex32 z: complex64') + From 52de4cc8fdf976e0e09f81904e4f427b4dc64015 Mon Sep 17 00:00:00 2001 From: chenxuqiang Date: Wed, 1 Jan 2020 21:50:45 -0500 Subject: [PATCH 838/935] kernel/arm64/dgemm_beta.S: add beta == zero branch added beta == zero branch, and no need to load C matrix. Signed by: Xuqiang Chen --- kernel/arm64/dgemm_beta.S | 69 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S index 1ce452212..20011c343 100644 --- a/kernel/arm64/dgemm_beta.S +++ b/kernel/arm64/dgemm_beta.S @@ -80,6 +80,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add sp, sp, #(11*16) .endm +.macro INIT_ZERO + fmul v0.2d, v0.2d, betaV0 + fmul v1.2d, v1.2d, betaV0 + fmul v2.2d, v2.2d, betaV0 + fmul v3.2d, v3.2d, betaV0 + fmul v4.2d, v4.2d, betaV0 + fmul v5.2d, v5.2d, betaV0 + fmul v6.2d, v6.2d, betaV0 + fmul v7.2d, v7.2d, betaV0 +.endm + /************************************************************************************** * End of macro definitions **************************************************************************************/ @@ -97,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble .Lgemm_beta_L999 + fcmp BETA, #0.0 + beq .Lgemm_beta_zero_01 + .Lgemm_beta_01: lsl LDC, LDC, #3 @@ -180,4 +194,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
RESTORE_REGS ret +.Lgemm_beta_zero_01: + INIT_ZERO + lsl LDC, LDC, #3 + + .align 5 +.Lgemm_beta_zero_02: + mov A01, C00 + add C00, C00, LDC + + asr I, M, #4 + cmp I, #0 + ble .Lgemm_beta_zero_04 + + add A02, A01, #64 + + .align 5 +.Lgemm_beta_zero_03: + + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [A01] + add A01, A01, calc_size + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [A02] + add A02, A02, calc_size + + subs I, I, #1 + bne .Lgemm_beta_zero_03 + + .align 5 +.Lgemm_beta_zero_04: + + and I, M, #15 + cmp I, #0 + ble .Lgemm_beta_zero_06 + + .align 5 +.Lgemm_beta_zero_05: + + str beta0, [A01] + add A01, A01, #8 + + subs I, I, #1 + bne .Lgemm_beta_zero_05 + + .align 5 +.Lgemm_beta_zero_06: + + subs N, N, #1 + bne .Lgemm_beta_zero_02 + + .align 5 +.Lgemm_beta_zero_L999: + + mov x0, #0 + RESTORE_REGS + ret + EPILOGUE From 893e6e57c46c5f2768468def5c8a77723c82df4a Mon Sep 17 00:00:00 2001 From: shengyang Date: Fri, 3 Jan 2020 10:03:33 +0800 Subject: [PATCH 839/935] modified: ctest/din3 ctest/sin3 --- ctest/din3 | 2 +- ctest/sin3 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ctest/din3 b/ctest/din3 index 23fedfe32..9919774ac 100644 --- a/ctest/din3 +++ b/ctest/din3 @@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO -6 NUMBER OF VALUES OF N +7 NUMBER OF VALUES OF N 1 2 3 5 7 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA diff --git a/ctest/sin3 b/ctest/sin3 index 644083f22..b74206b70 100644 --- a/ctest/sin3 +++ b/ctest/sin3 @@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO -6 NUMBER OF VALUES OF N +7 NUMBER OF VALUES OF N 0 1 2 3 5 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA From 2ea2bd99c7ba038d5366e8309359d32f0d5f6cd7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Jan 2020 11:10:00 +0100 Subject: [PATCH 840/935] Apply LAPACKE fix for eigenvector transposition in symmetric eigensolvers from Reference-LAPACK PR 330 --- lapack-netlib/LAPACKE/src/lapacke_cheev_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c | 7 +++++-- lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_zheev_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c | 6 +++++- lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c | 6 +++++- 12 files changed, 60 insertions(+), 13 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c index f505dfab0..aa78e678e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c @@ -78,7 +78,11 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + 
LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c index e9e6a5d1d..d26c84785 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c @@ -79,7 +79,11 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c index 4c5f352a8..e8f212efb 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c @@ -79,8 +79,11 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); - + if ( jobz == 'V') { + LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c index 5a416ff45..f696c608f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c @@ -72,7 +72,11 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c index 90d8ce8dc..6f9c02f6a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c @@ -76,7 +76,11 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c index fff476445..81ba2acb3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c @@ -76,7 +76,11 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, 
n, a_t, lda_t, a, lda ); + } else { + LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c index 6a2f8fce3..abd62ddf3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c @@ -72,7 +72,11 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c index 9394f822f..d9fe47599 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c @@ -76,7 +76,11 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c index 12d9e84e6..bfbf49aee 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c @@ -76,7 +76,11 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c index ce278b272..d4e93aed2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c @@ -78,7 +78,11 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c index bf2e2c828..fb33c3e2a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c @@ -79,7 +79,11 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + 
LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c index f09cfe49d..5af2a1269 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c @@ -79,7 +79,11 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + if ( jobz == 'V') { + LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); + } else { + LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); + } /* Release memory and exit */ LAPACKE_free( a_t ); exit_level_0: From eb3c9f1db94543367cb32a6d656635dd3b99d38c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 6 Jan 2020 12:07:02 +0800 Subject: [PATCH 841/935] optimize AVX2 SGEMM --- kernel/x86_64/sgemm_kernel_8x4_haswell.c | 490 +++++++++++++++++++++++ 1 file changed, 490 insertions(+) create mode 100644 kernel/x86_64/sgemm_kernel_8x4_haswell.c diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c new file mode 100644 index 000000000..87d9aa394 --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c @@ -0,0 +1,490 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = &alpha, %7 = b_pref */ +/* r11 = m_counter, r12 = k << 2(const), r13 = k_skip << 2, r14 = b_head_pos(const), r15 for assisting prefetch */ + +//recommended settings: GEMM_P = 320, GEMM_Q = 320. + +#ifdef TRMMKERNEL + #define mult_alpha(acc,alpha,...) "vmulps "#acc","#alpha","#acc";" +#else + #define mult_alpha(acc,alpha,...) 
"vfmadd213ps ("#__VA_ARGS__"),"#alpha","#acc";" +#endif + +#if defined TRMMKERNEL && !defined LEFT + #ifdef TRANSA + #define HEAD_SET_OFFSET(ndim) {} + #define TAIL_SET_OFFSET(ndim) {off+=ndim;} + #else + #define HEAD_SET_OFFSET(ndim) {off+=(ndim>4?4:ndim);} + #define TAIL_SET_OFFSET(ndim) {off+=(ndim>4?(ndim-4):0);} + #endif +#else + #define HEAD_SET_OFFSET(ndim) {} + #define TAIL_SET_OFFSET(ndim) {} +#endif + +#if defined TRMMKERNEL && defined LEFT + #ifdef TRANSA + #define init_update_kskip(val) "subq $"#val",%%r13;" + #define save_update_kskip(val) "" + #else + #define init_update_kskip(val) "" + #define save_update_kskip(val) "addq $"#val",%%r13;" + #endif +#else + #define init_update_kskip(val) "" + #define save_update_kskip(val) "" +#endif + +#ifdef TRMMKERNEL + #define init_set_k "movq %%r12,%4; subq %%r13,%4;" + #if LEFT != TRANSA + #define INIT_SET_KSKIP "movq %9,%%r13; salq $2,%%r13;" + #define init_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;" + #define save_set_pointers(a_copy,b_copy) "" + #else + #define INIT_SET_KSKIP "movq %4,%%r13; subq %9,%%r13; salq $2,%%r13;" + #define init_set_pointers(a_copy,b_copy) "" + #define save_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;" + #endif +#else + #define INIT_SET_KSKIP "xorq %%r13,%%r13;" + #define init_set_k "movq %%r12,%4;" + #define init_set_pointers(a_copy,b_copy) "" + #define save_set_pointers(a_copy,b_copy) "" +#endif +#define init_set_pa_pb_n12(mdim) init_set_pointers(mdim,4) +#define init_set_pa_pb_n8(mdim) init_set_pointers(mdim,4) +#define init_set_pa_pb_n4(mdim) init_set_pointers(mdim,4) +#define init_set_pa_pb_n2(mdim) init_set_pointers(mdim,2) +#define init_set_pa_pb_n1(mdim) init_set_pointers(mdim,1) +#define save_set_pa_pb_n12(mdim) save_set_pointers(mdim,4) +#define save_set_pa_pb_n8(mdim) save_set_pointers(mdim,4) +#define save_set_pa_pb_n4(mdim) save_set_pointers(mdim,4) +#define save_set_pa_pb_n2(mdim) save_set_pointers(mdim,2) +#define save_set_pa_pb_n1(mdim) save_set_pointers(mdim,1) + +#if defined TRMMKERNEL && !defined LEFT && defined TRANSA + #define kernel_kstart_n8(mdim) \ + KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "subq $16,%4;" + #define kernel_kstart_n12(mdim) \ + KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4\ + KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 "subq $32,%4;" +#else + #define kernel_kstart_n8(mdim) "" + #define kernel_kstart_n12(mdim) "" +#endif +#define kernel_kstart_n4(mdim) "" +#define kernel_kstart_n2(mdim) "" +#define kernel_kstart_n1(mdim) "" + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m8n1 \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m8n2 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" +#define KERNEL_h_k1m8n4 \ + KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff1,boff2,...) 
\ + "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,8,%1,%%r12,4) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,8,%1,%%r12,8) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" +#define KERNEL_k2m8n1 KERNEL_k1m8n1 KERNEL_k1m8n1 +#define KERNEL_k2m8n2 KERNEL_k1m8n2 KERNEL_k1m8n2 +#define KERNEL_k2m8n4 KERNEL_k1m8n4 KERNEL_k1m8n4 +#define KERNEL_k2m8n8 KERNEL_k1m8n8 KERNEL_k1m8n8 +#define KERNEL_k2m8n12 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,8,%1)\ + unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,8,%1,%%r12,4)\ + unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,8,%1,%%r12,8)\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; prefetcht0 512(%0); addq $64,%0;"\ + unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,16,24,%1)\ + unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,24,%1,%%r12,4)\ + unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,16,24,%1,%%r12,8) "addq $32,%1;" +#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA + #define unit_kernel_endn4_k1m8n8(offa1,offb1,offb2) \ + "vmovsldup "#offa1"(%0),%%ymm1; vmovshdup "#offa1"(%0),%%ymm2;"\ + unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,offb1,offb2,%1,%%r12,4) + #define unit_kernel_endn4_k1m8n12(offa1,offb1,offb2) \ + "vmovsldup "#offa1"(%0),%%ymm1; vmovshdup "#offa1"(%0),%%ymm2;"\ + unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,offb1,offb2,%1,%%r12,8) + #define unit_kernel_endn8_k1m8n12(offa1,offb1,offb2) unit_kernel_endn4_k1m8n8(offa1,offb1,offb2)\ + unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,offb1,offb2,%1,%%r12,8) + #define kernel_kend_m8n8 \ + unit_kernel_endn4_k1m8n8(0,0,8) unit_kernel_endn4_k1m8n8(32,16,24)\ + unit_kernel_endn4_k1m8n8(64,32,40) unit_kernel_endn4_k1m8n8(96,48,56) + #define kernel_kend_m8n12 \ + unit_kernel_endn8_k1m8n12(0,0,8) unit_kernel_endn8_k1m8n12(32,16,24)\ + unit_kernel_endn8_k1m8n12(64,32,40) unit_kernel_endn8_k1m8n12(96,48,56)\ + unit_kernel_endn4_k1m8n12(128,64,72) unit_kernel_endn4_k1m8n12(160,80,88)\ + unit_kernel_endn4_k1m8n12(192,96,104) unit_kernel_endn4_k1m8n12(224,112,120) +#else + #define kernel_kend_m8n8 "" + #define kernel_kend_m8n12 "" +#endif +#define kernel_kend_m8n4 "" +#define kernel_kend_m8n2 "" +#define kernel_kend_m8n1 "" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_m8n1 mult_alpha(%%ymm4,%%ymm0,%2) "vmovups %%ymm4,(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\ + mult_alpha(c1,%%ymm0,%5) "vmovups "#c1",(%5);"\ + mult_alpha(c2,%%ymm0,%5,%3,1) "vmovups "#c2",(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m8n2 "movq %2,%5;" 
unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_m8n8 SAVE_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define COMPUTE_m8(ndim) \ + init_update_kskip(32) INIT_m8n##ndim\ + init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(8) "movq %2,%5; movq $0,%%r15;"\ + kernel_kstart_n##ndim(8)\ + "cmpq $64,%4; jb "#ndim"882f;"\ + #ndim"881:\n\t"\ + "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\ + KERNEL_k2m8n##ndim KERNEL_k2m8n##ndim\ + "prefetcht1 (%5); subq $31,%5;"\ + KERNEL_k2m8n##ndim KERNEL_k2m8n##ndim\ + "addq %%r15,%5; prefetcht1 (%7); addq $16,%7;"\ + "subq $32,%4; cmpq $64,%4; jnb "#ndim"881b;"\ + "movq %2,%5;"\ + #ndim"882:\n\t"\ + "testq %4,%4; jz "#ndim"883f;"\ + "prefetcht0 (%5); prefetcht0 31(%5);"\ + KERNEL_k1m8n##ndim\ + "prefetcht0 (%5,%3,4); prefetcht0 31(%5,%3,4); addq %3,%5;"\ + "subq $4,%4; jmp "#ndim"882b;"\ + #ndim"883:\n\t"\ + kernel_kend_m8n##ndim "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ + save_set_pa_pb_n##ndim(8) SAVE_m8n##ndim "addq $32,%2;" save_update_kskip(32) + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,offb1,offb2,...) 
\ + "vmovddup "#offb1"("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup "#offb2"("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,0,8,%1,%%r12,4) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,0,8,%1,%%r12,8) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" +#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA + #define unit_kernel_endn4_k1m4n8(offa1,offb1,offb2) \ + "vmovsldup "#offa1"(%0),%%xmm1; vmovshdup "#offa1"(%0),%%xmm2;"\ + unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,offb1,offb2,%1,%%r12,4) + #define unit_kernel_endn4_k1m4n12(offa1,offb1,offb2) \ + "vmovsldup "#offa1"(%0),%%xmm1; vmovshdup "#offa1"(%0),%%xmm2;"\ + unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,offb1,offb2,%1,%%r12,8) + #define unit_kernel_endn8_k1m4n12(offa1,offb1,offb2) unit_kernel_endn4_k1m4n8(offa1,offb1,offb2)\ + unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,offb1,offb2,%1,%%r12,8) + #define kernel_kend_m4n8 \ + unit_kernel_endn4_k1m4n8(0,0,8) unit_kernel_endn4_k1m4n8(16,16,24)\ + unit_kernel_endn4_k1m4n8(32,32,40) unit_kernel_endn4_k1m4n8(48,48,56) + #define kernel_kend_m4n12 \ + unit_kernel_endn8_k1m4n12(0,0,8) unit_kernel_endn8_k1m4n12(16,16,24)\ + unit_kernel_endn8_k1m4n12(32,32,40) unit_kernel_endn8_k1m4n12(48,48,56)\ + unit_kernel_endn4_k1m4n12(64,64,72) unit_kernel_endn4_k1m4n12(80,80,88)\ + unit_kernel_endn4_k1m4n12(96,96,104) unit_kernel_endn4_k1m4n12(112,112,120) +#else + #define kernel_kend_m4n8 "" + #define kernel_kend_m4n12 "" +#endif +#define kernel_kend_m4n4 "" +#define kernel_kend_m4n2 "" +#define kernel_kend_m4n1 "" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_m4n1 \ + mult_alpha(%%xmm4,%%xmm0,%2) "vmovups %%xmm4,(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\ + mult_alpha(c1,%%xmm0,%5) "vmovups "#c1",(%5);"\ + mult_alpha(c2,%%xmm0,%5,%3,1) "vmovups "#c2",(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) +#define COMPUTE_m4(ndim) \ + init_update_kskip(16) INIT_m4n##ndim\ + init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(4)\ + kernel_kstart_n##ndim(4)\ + #ndim"442:\n\t"\ + "testq %4,%4; jz "#ndim"443f;"\ + KERNEL_k1m4n##ndim\ + "subq $4,%4; jmp "#ndim"442b;"\ + #ndim"443:\n\t"\ + kernel_kend_m4n##ndim save_set_pa_pb_n##ndim(4) SAVE_m4n##ndim "addq $16,%2;" save_update_kskip(16) + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" 
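+/* note: KERNEL_k1m2n1 below performs one k step of the 2x1 micro-tile:
+   vmovsd loads two floats of packed A, vbroadcastss duplicates one float
+   of B across the register, and vfmadd231ps accumulates both products
+   into the xmm4 accumulator */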
+#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#ifdef TRMMKERNEL + #define SAVE_m2n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#else + #define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#endif +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#ifdef TRMMKERNEL + #define SAVE_m2n2 SAVE_m2n1 "vmulps %%xmm5,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#else + #define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#endif +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA + #define unit_kernel_endn4_k1m2n8(aoff1,aoff2,boff) \ + "vmovups "#boff"(%1,%%r12,4),%%xmm3;"\ + "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6;"\ + "vbroadcastss "#aoff2"(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm7;" + #define unit_kernel_endn4_k1m2n12(aoff1,aoff2,boff) \ + "vmovups "#boff"(%1,%%r12,8),%%xmm3;"\ + "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm8;"\ + "vbroadcastss "#aoff2"(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm9;" + #define unit_kernel_endn8_k1m2n12(aoff1,aoff2,boff) \ + "vmovups "#boff"(%1,%%r12,4),%%xmm3; vmovups "#boff"(%1,%%r12,8),%%xmm2;"\ + "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6; vfmadd231ps %%xmm2,%%xmm1,%%xmm8;"\ + "vbroadcastss "#aoff2"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm7; vfmadd231ps %%xmm2,%%xmm1,%%xmm9;" + #define kernel_kend_m2n8 \ + unit_kernel_endn4_k1m2n8(0,4,0) unit_kernel_endn4_k1m2n8(8,12,16)\ + unit_kernel_endn4_k1m2n8(16,20,32) unit_kernel_endn4_k1m2n8(24,28,48) + #define kernel_kend_m2n12 \ + unit_kernel_endn8_k1m2n12(0,4,0) unit_kernel_endn8_k1m2n12(8,12,16)\ + unit_kernel_endn8_k1m2n12(16,20,32) unit_kernel_endn8_k1m2n12(24,28,48)\ + unit_kernel_endn4_k1m2n12(32,36,64) unit_kernel_endn4_k1m2n12(40,44,80)\ + unit_kernel_endn4_k1m2n12(48,52,96) unit_kernel_endn4_k1m2n12(56,60,112) +#else + #define kernel_kend_m2n8 "" + #define kernel_kend_m2n12 "" +#endif +#define kernel_kend_m2n4 "" +#define kernel_kend_m2n2 "" +#define kernel_kend_m2n1 "" +#ifdef TRMMKERNEL + 
#define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmulps %%xmm1,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmulps %%xmm2,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" +#else + #define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" +#endif +#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) +#define COMPUTE_m2(ndim) \ + init_update_kskip(8) INIT_m2n##ndim\ + init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(2)\ + kernel_kstart_n##ndim(2)\ + #ndim"222:\n\t"\ + "testq %4,%4; jz "#ndim"223f;"\ + KERNEL_k1m2n##ndim\ + "subq $4,%4; jmp "#ndim"222b;"\ + #ndim"223:\n\t"\ + kernel_kend_m2n##ndim save_set_pa_pb_n##ndim(2) SAVE_m2n##ndim "addq $8,%2;" save_update_kskip(8) + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#ifdef TRMMKERNEL + #define SAVE_m1n1 "vmulss %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#else + #define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#endif +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#ifdef TRMMKERNEL + #define SAVE_m1n2 \ + "vmulps %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#else + #define SAVE_m1n2 \ + "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#endif +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA + #define unit_kernel_endn4_k1m1n8(aoff,boff) \ + "vmovups "#boff"(%1,%%r12,4),%%xmm3;"\ + "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5;" + #define unit_kernel_endn4_k1m1n12(aoff,boff) \ + "vmovups "#boff"(%1,%%r12,8),%%xmm3;"\ + "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6;" + #define unit_kernel_endn8_k1m1n12(aoff,boff) \ + "vmovups "#boff"(%1,%%r12,4),%%xmm3; vmovups "#boff"(%1,%%r12,8),%%xmm2;"\ + "vbroadcastss 
"#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;" + #define kernel_kend_m1n8 \ + unit_kernel_endn4_k1m1n8(0,0) unit_kernel_endn4_k1m1n8(4,16)\ + unit_kernel_endn4_k1m1n8(8,32) unit_kernel_endn4_k1m1n8(12,48) + #define kernel_kend_m1n12 \ + unit_kernel_endn8_k1m1n12(0,0) unit_kernel_endn8_k1m1n12(4,16)\ + unit_kernel_endn8_k1m1n12(8,32) unit_kernel_endn8_k1m1n12(12,48)\ + unit_kernel_endn4_k1m1n12(16,64) unit_kernel_endn4_k1m1n12(20,80)\ + unit_kernel_endn4_k1m1n12(24,96) unit_kernel_endn4_k1m1n12(28,112) +#else + #define kernel_kend_m1n8 "" + #define kernel_kend_m1n12 "" +#endif +#define kernel_kend_m1n4 "" +#define kernel_kend_m1n2 "" +#define kernel_kend_m1n1 "" +#ifdef TRMMKERNEL + #define unit_save_m1n4(c1) \ + "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmulps %%xmm2,%%xmm0,%%xmm2; vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmulps %%xmm1,%%xmm0,%%xmm1; vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#else + #define unit_save_m1n4(c1) \ + "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#endif +#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) +#define COMPUTE_m1(ndim) \ + init_update_kskip(4) INIT_m1n##ndim\ + init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(1)\ + kernel_kstart_n##ndim(1)\ + #ndim"112:\n\t"\ + "testq %4,%4; jz "#ndim"113f;"\ + KERNEL_k1m1n##ndim\ + "subq $4,%4; jmp "#ndim"112b;"\ + #ndim"113:\n\t"\ + kernel_kend_m1n##ndim save_set_pa_pb_n##ndim(1) SAVE_m1n##ndim "addq $4,%2;" save_update_kskip(4) + +#define COMPUTE(ndim) {\ + HEAD_SET_OFFSET(ndim) next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastss (%6),%%ymm0;"\ + "movq %4,%%r12; salq $2,%%r12; movq %1,%%r14; movq %8,%%r11;" INIT_SET_KSKIP\ + "cmpq $8,%%r11;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8(ndim)\ + "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%%r11;jb 33103"#ndim"f;"\ + COMPUTE_m4(ndim)\ + "subq $4,%%r11;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%%r11;jb 33104"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%%r11;"\ + "33104"#ndim":\n\t"\ + "testq %%r11,%%r11;jz 33105"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(next_b)\ + :"m"(M),"m"(off):"r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + TAIL_SET_OFFSET(ndim) a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\ +} + +#include "common.h" +#include +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC +#ifdef TRMMKERNEL +,BLASLONG offset +#endif +){ + if(m==0||n==0||k==0||alpha==0.0) return 0; + int64_t ldc_in_bytes = 
(int64_t)LDC * sizeof(float); + float constval = alpha; + float *const_val=&constval; + int64_t M = (int64_t)m, K = (int64_t)k, off = 0; +#ifdef TRMMKERNEL + #ifdef LEFT + off = offset; + #else + off = -offset; + #endif +#endif + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From b73bf01378d179b30fc515714d504ffeeb5360cd Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 6 Jan 2020 12:09:14 +0800 Subject: [PATCH 842/935] optimize AVX2 SGEMM --- kernel/x86_64/KERNEL.HASWELL | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9e30c12f2..d24b7f3b3 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -31,11 +31,11 @@ DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c -STRMMKERNEL = sgemm_kernel_16x4_haswell.S -SGEMMKERNEL = sgemm_kernel_16x4_haswell.S +STRMMKERNEL = sgemm_kernel_8x4_haswell.c +SGEMMKERNEL = sgemm_kernel_8x4_haswell.c SGEMM_BETA = sgemm_beta_skylakex.c -SGEMMINCOPY = ../generic/gemm_ncopy_16.c -SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) From 92b10212de6972c808ebeccfe9fac0a82012e94e Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 6 Jan 2020 12:11:21 +0800 Subject: [PATCH 843/935] optimize AVX2 SGEMM --- kernel/x86_64/KERNEL.ZEN | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index 98cd38dfa..7cec2e5ed 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -30,10 +30,10 @@ DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c -STRMMKERNEL = sgemm_kernel_16x4_haswell.S -SGEMMKERNEL = sgemm_kernel_16x4_haswell.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c -SGEMMITCOPY = ../generic/gemm_tcopy_16.c +STRMMKERNEL = sgemm_kernel_8x4_haswell.c +SGEMMKERNEL = sgemm_kernel_8x4_haswell.c +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) From b7b408a12018c2ffde0c595bb588d464389b74e3 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 6 Jan 2020 12:16:09 +0800 Subject: [PATCH 844/935] optimize AVX2 SGEMM --- param.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/param.h b/param.h index d03e60fcb..70c5945ae 100644 --- a/param.h +++ b/param.h @@ -625,7 +625,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else -#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -666,7 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else -#define SGEMM_DEFAULT_P 768 +#define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 192 @@ -675,7 +675,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 320 #define DGEMM_DEFAULT_Q 128 #else -#define SGEMM_DEFAULT_Q 384 +#define SGEMM_DEFAULT_Q 320 #define DGEMM_DEFAULT_Q 256 #endif #define CGEMM_DEFAULT_Q 256 @@ -1528,7 +1528,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else -#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -1569,7 +1569,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else -#define SGEMM_DEFAULT_P 768 +#define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 192 @@ -1578,7 +1578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 320 #define DGEMM_DEFAULT_Q 128 #else -#define SGEMM_DEFAULT_Q 384 +#define SGEMM_DEFAULT_Q 320 #define DGEMM_DEFAULT_Q 256 #endif #define CGEMM_DEFAULT_Q 256 From 9f5cdc49d4b757618265bef98ad6bd354ad03012 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 6 Jan 2020 12:28:43 +0800 Subject: [PATCH 845/935] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9829c31f9..df497c1d2 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -179,3 +179,4 @@ In chronological order: * [2019-11-12] AVX512 CGEMM & ZGEMM kernels * [2019-12-23] optimize AVX2 CGEMM and ZGEMM * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels + * [2020-01-07] optimize AVX2 SGEMM and STRMM From 9dc9b7b95ec2a97f00f0e920906cc1f672938a11 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 6 Jan 2020 20:11:36 +0800 Subject: [PATCH 846/935] Update sgemm_kernel_8x4_haswell.c --- kernel/x86_64/sgemm_kernel_8x4_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c index 87d9aa394..9b3ba7632 100644 --- a/kernel/x86_64/sgemm_kernel_8x4_haswell.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c @@ -37,7 +37,7 @@ #ifdef TRMMKERNEL #define init_set_k "movq %%r12,%4; subq %%r13,%4;" - #if LEFT != TRANSA + #if (defined LEFT && !defined TRANSA) || (!defined LEFT && defined TRANSA) #define INIT_SET_KSKIP "movq %9,%%r13; salq $2,%%r13;" #define init_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;" #define save_set_pointers(a_copy,b_copy) "" From bd4c032f52fb6bd1b8f6352baf23836c00842f05 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 7 Jan 2020 11:22:46 +0800 Subject: [PATCH 847/935] Update sgemm_kernel_8x4_haswell.c --- kernel/x86_64/sgemm_kernel_8x4_haswell.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c index 9b3ba7632..2b8aa9862 100644 --- a/kernel/x86_64/sgemm_kernel_8x4_haswell.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c @@ -9,7 +9,7 @@ #define mult_alpha(acc,alpha,...) 
"vfmadd213ps ("#__VA_ARGS__"),"#alpha","#acc";" #endif -#if defined TRMMKERNEL && !defined LEFT +#if defined(TRMMKERNEL) && !defined(LEFT) #ifdef TRANSA #define HEAD_SET_OFFSET(ndim) {} #define TAIL_SET_OFFSET(ndim) {off+=ndim;} @@ -22,7 +22,7 @@ #define TAIL_SET_OFFSET(ndim) {} #endif -#if defined TRMMKERNEL && defined LEFT +#if defined(TRMMKERNEL) && defined(LEFT) #ifdef TRANSA #define init_update_kskip(val) "subq $"#val",%%r13;" #define save_update_kskip(val) "" @@ -37,7 +37,7 @@ #ifdef TRMMKERNEL #define init_set_k "movq %%r12,%4; subq %%r13,%4;" - #if (defined LEFT && !defined TRANSA) || (!defined LEFT && defined TRANSA) + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) #define INIT_SET_KSKIP "movq %9,%%r13; salq $2,%%r13;" #define init_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;" #define save_set_pointers(a_copy,b_copy) "" @@ -63,7 +63,7 @@ #define save_set_pa_pb_n2(mdim) save_set_pointers(mdim,2) #define save_set_pa_pb_n1(mdim) save_set_pointers(mdim,1) -#if defined TRMMKERNEL && !defined LEFT && defined TRANSA +#if defined(TRMMKERNEL) && !defined(LEFT) && defined(TRANSA) #define kernel_kstart_n8(mdim) \ KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "subq $16,%4;" #define kernel_kstart_n12(mdim) \ @@ -109,7 +109,7 @@ unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,16,24,%1)\ unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,24,%1,%%r12,4)\ unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,16,24,%1,%%r12,8) "addq $32,%1;" -#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA +#if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA) #define unit_kernel_endn4_k1m8n8(offa1,offb1,offb2) \ "vmovsldup "#offa1"(%0),%%ymm1; vmovshdup "#offa1"(%0),%%ymm2;"\ unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,offb1,offb2,%1,%%r12,4) @@ -192,7 +192,7 @@ #define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,0,8,%1,%%r12,8) #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" -#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA +#if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA) #define unit_kernel_endn4_k1m4n8(offa1,offb1,offb2) \ "vmovsldup "#offa1"(%0),%%xmm1; vmovshdup "#offa1"(%0),%%xmm2;"\ unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,offb1,offb2,%1,%%r12,4) @@ -285,7 +285,7 @@ "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ "addq $8,%0;" -#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA +#if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA) #define unit_kernel_endn4_k1m2n8(aoff1,aoff2,boff) \ "vmovups "#boff"(%1,%%r12,4),%%xmm3;"\ "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6;"\ @@ -379,7 +379,7 @@ "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ "addq $4,%0;" -#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA +#if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA) #define unit_kernel_endn4_k1m1n8(aoff,boff) \ "vmovups "#boff"(%1,%%r12,4),%%xmm3;"\ "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps 
%%xmm3,%%xmm1,%%xmm5;" From 3a100b2797b62c1fc1341a668accda12137807da Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 9 Jan 2020 13:48:41 +0800 Subject: [PATCH 848/935] Update KERNEL.SKYLAKEX --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index d5d32d1b3..0e6275748 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -1,7 +1,7 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c - +STRMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c From daa4310db5c8abf82233810c240eacd640e78206 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Jan 2020 22:00:50 +0100 Subject: [PATCH 849/935] Install new lapack.h new file in LAPACK 3.9.0, split off from lapacke.h --- Makefile.install | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.install b/Makefile.install index 8070b4729..e01d866c9 100644 --- a/Makefile.install +++ b/Makefile.install @@ -51,6 +51,7 @@ endif ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" @@ -100,6 +101,7 @@ else #install on AIX has different options syntax ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" From 1c675670081422b8a3d7f0998dfd7d1454c0d2bd Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 13 Jan 2020 16:26:03 +0800 Subject: [PATCH 850/935] improve skylakex paralleled sgemm performance --- param.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/param.h b/param.h index 70c5945ae..3baae31cf 100644 --- a/param.h +++ b/param.h @@ -1690,18 +1690,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #else

-#define SGEMM_DEFAULT_P 768
+#define SGEMM_DEFAULT_P 640
 #define DGEMM_DEFAULT_P 384
 #define CGEMM_DEFAULT_P 384
 #define ZGEMM_DEFAULT_P 256

-#ifdef WINDOWS_ABI
-#define SGEMM_DEFAULT_Q 192
-#define DGEMM_DEFAULT_Q 168
-#else
-#define SGEMM_DEFAULT_Q 192
+#define SGEMM_DEFAULT_Q 320
 #define DGEMM_DEFAULT_Q 168
-#endif
 #define CGEMM_DEFAULT_Q 192
 #define ZGEMM_DEFAULT_Q 128

From feaafbedd347871b3f25a018e6655fa9af6d141c Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 13 Jan 2020 16:28:41 +0800
Subject: [PATCH 851/935] make skylakex sgemm code more readable

Some kernels were also adjusted to improve performance
---
 kernel/x86_64/sgemm_direct_skylakex.c        | 467 ++++++++++++
 kernel/x86_64/sgemm_kernel_16x4_skylakex.c   | 465 +-----------
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 715 +++----------------
 3 files changed, 576 insertions(+), 1071 deletions(-)
 create mode 100644 kernel/x86_64/sgemm_direct_skylakex.c

diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c
new file mode 100644
index 000000000..4f9af6e57
--- /dev/null
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@@ -0,0 +1,467 @@
+
+/* the direct sgemm code written by Arjan van der Ven */
+#include <immintrin.h>
+
+/*
+ * "Direct sgemm" code. This code operates directly on the inputs and outputs
+ * of the sgemm call, avoiding the copies, memory realignments and threading,
+ * and only supports alpha = 1 and beta = 0.
+ * This is a common case and provides value for relatively small matrixes.
+ * For larger matrixes the "regular" sgemm code is superior, there the cost of
+ * copying/shuffling the B matrix really pays off.
+ */
+
+
+
+#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps()
+#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
+#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)])
+#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M)
+
+
+#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps()
+#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
+#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)])
+#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M)
+
+#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps()
+#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
+#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)])
+#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M)
+
+#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0;
+#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)];
+#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + N];
+#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N;
+#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M;
+
+int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
+{
+	unsigned long long mnk = M * N * K;
+	/* large matrixes -> not performant */
+	if (mnk >= 28 *
512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + + + +void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +{ + int i, j, k; + + int m4 = M & ~3; + int m2 = M & ~1; + + int n64 = N & ~63; + int n32 = N & ~31; + int n16 = N & ~15; + int n8 = N & ~7; + int n4 = N & ~3; + int n2 = N & ~1; + + i = 0; + + for (i = 0; i < m4; i+=4) { + + for (j = 0; j < n64; j+= 64) { + k = 0; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + BROADCAST_LOAD_A_512(x, 2); + BROADCAST_LOAD_A_512(x, 3); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + DECLARE_RESULT_256(0, 2); + DECLARE_RESULT_256(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + 
BROADCAST_LOAD_A_256(x, 1); + BROADCAST_LOAD_A_256(x, 2); + BROADCAST_LOAD_A_256(x, 3); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + MATMUL_256(0, 2); + MATMUL_256(0, 3); + } + STORE_256(0, 0); + STORE_256(0, 1); + STORE_256(0, 2); + STORE_256(0, 3); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + DECLARE_RESULT_128(0, 2); + DECLARE_RESULT_128(0, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + BROADCAST_LOAD_A_128(x, 2); + BROADCAST_LOAD_A_128(x, 3); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + MATMUL_128(0, 2); + MATMUL_128(0, 3); + } + STORE_128(0, 0); + STORE_128(0, 1); + STORE_128(0, 2); + STORE_128(0, 3); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2); + DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + BROADCAST_LOAD_A_SCALAR(x, 2); + BROADCAST_LOAD_A_SCALAR(x, 3); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2); + MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + STORE_SCALAR(0, 2); STORE_SCALAR(1, 2); + STORE_SCALAR(0, 3); STORE_SCALAR(1, 3); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0) + DECLARE_RESULT_SCALAR(0, 1) + DECLARE_RESULT_SCALAR(0, 2) + DECLARE_RESULT_SCALAR(0, 3) + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + BROADCAST_LOAD_A_SCALAR(0, 2); + BROADCAST_LOAD_A_SCALAR(0, 3); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + MATMUL_SCALAR(0, 2); + MATMUL_SCALAR(0, 3); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + STORE_SCALAR(0, 2); + STORE_SCALAR(0, 3); + } + } + + for (; i < m2; i+=2) { + j = 0; + + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); LOAD_B_512(1, x); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + BROADCAST_LOAD_A_512(x, 1); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + 
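+			/*
+			 * STORE_512(N, M) plainly overwrites the output tile rather
+			 * than accumulating into it -- this is where the "beta = 0
+			 * only" restriction stated in the header comment is baked in.
+			 */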
STORE_512(0, 1); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + BROADCAST_LOAD_A_256(x, 1); + + LOAD_B_256(0, x); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + } + STORE_256(0, 0); + STORE_256(0, 1); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + BROADCAST_LOAD_A_128(x, 1); + + LOAD_B_128(0, x); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + } + STORE_128(0, 0); + STORE_128(0, 1); + } + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + BROADCAST_LOAD_A_SCALAR(x, 1); + + LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); + + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + DECLARE_RESULT_SCALAR(0, 1); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + BROADCAST_LOAD_A_SCALAR(0, 1); + + LOAD_B_SCALAR(0, 0); + + MATMUL_SCALAR(0, 0); + MATMUL_SCALAR(0, 1); + } + STORE_SCALAR(0, 0); + STORE_SCALAR(0, 1); + } + } + + for (; i < M; i+=1) { + j = 0; + for (; j < n64; j+= 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + for (; j < n32; j+= 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + LOAD_B_512(0, x); LOAD_B_512(1, x); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + + + for (; j < n16; j+= 16) { + DECLARE_RESULT_512(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(x, 0); + + LOAD_B_512(0, x); + + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + + for (; j < n8; j+= 8) { + DECLARE_RESULT_256(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_256(x, 0); + LOAD_B_256(0, x); + MATMUL_256(0, 0); + } + STORE_256(0, 0); + } + + for (; j < n4; j+= 4) { + DECLARE_RESULT_128(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_128(x, 0); + LOAD_B_128(0, x); + MATMUL_128(0, 0); + } + STORE_128(0, 0); + } + + for (; j < n2; j+= 2) { + DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(x, 0); + LOAD_B_SCALAR(0, 0); LOAD_B_SCALAR(1, 0); + MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); + } + STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); + } + + for (; j < N; j++) { + DECLARE_RESULT_SCALAR(0, 0); + + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_SCALAR(0, 0); + LOAD_B_SCALAR(0, 0); + MATMUL_SCALAR(0, 0); + } + STORE_SCALAR(0, 0); + } + } +} diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 76b82e65b..d174bbcc3 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -1176,467 +1176,4 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo return 0; } - -/* - * "Direct sgemm" code. 
This code operates directly on the inputs and outputs - * of the sgemm call, avoiding the copies, memory realignments and threading, - * and only supports alpha = 1 and beta = 0. - * This is a common case and provides value for relatively small matrixes. - * For larger matrixes the "regular" sgemm code is superior, there the cost of - * copying/shuffling the B matrix really pays off. - */ - - - -#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps() -#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) -#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)]) -#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M) -#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M) - - -#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps() -#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) -#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)]) -#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M) -#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M) - -#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps() -#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) -#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)]) -#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M) -#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M) - -#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0; -#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)]; -#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + N]; -#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; -#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; - -int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) -{ - unsigned long long mnk = M * N * K; - /* large matrixes -> not performant */ - if (mnk >= 28 * 512 * 512) - return 0; - - /* - * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, - * and the regular sgemm copy/realignment of data pays off much quicker - */ - if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) - return 0; - -#ifdef SMP - /* if we can run multithreaded, the threading changes the based threshold */ - if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) - return 0; -#endif - - return 1; -} - - - -void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) -{ - int i, j, k; - - int m4 = M & ~3; - int m2 = M & ~1; - - int n64 = N & ~63; - int n32 = N & ~31; - int n16 = N & ~15; - int n8 = N & ~7; - int n4 = N & ~3; - int n2 = N & ~1; - - i = 0; - - for (i = 0; i < m4; i+=4) { - - for (j = 0; j < n64; j+= 64) { - k = 0; - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); - DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); - DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); 
DECLARE_RESULT_512(3, 3); - - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - BROADCAST_LOAD_A_512(x, 2); - BROADCAST_LOAD_A_512(x, 3); - - LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); - - MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); - MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); - MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); - } - STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); - STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); - STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); - STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); - } - - for (; j < n32; j+= 32) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); - DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); - DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - BROADCAST_LOAD_A_512(x, 2); - BROADCAST_LOAD_A_512(x, 3); - - LOAD_B_512(0, x); LOAD_B_512(1, x); - - MATMUL_512(0, 0); MATMUL_512(1, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); - MATMUL_512(0, 2); MATMUL_512(1, 2); - MATMUL_512(0, 3); MATMUL_512(1, 3); - } - STORE_512(0, 0); STORE_512(1, 0); - STORE_512(0, 1); STORE_512(1, 1); - STORE_512(0, 2); STORE_512(1, 2); - STORE_512(0, 3); STORE_512(1, 3); - } - - for (; j < n16; j+= 16) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - DECLARE_RESULT_512(0, 2); - DECLARE_RESULT_512(0, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - BROADCAST_LOAD_A_512(x, 2); - BROADCAST_LOAD_A_512(x, 3); - - LOAD_B_512(0, x); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - MATMUL_512(0, 2); - MATMUL_512(0, 3); - } - STORE_512(0, 0); - STORE_512(0, 1); - STORE_512(0, 2); - STORE_512(0, 3); - } - - for (; j < n8; j+= 8) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - DECLARE_RESULT_256(0, 2); - DECLARE_RESULT_256(0, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_256(x, 0); - BROADCAST_LOAD_A_256(x, 1); - BROADCAST_LOAD_A_256(x, 2); - BROADCAST_LOAD_A_256(x, 3); - - LOAD_B_256(0, x); - - MATMUL_256(0, 0); - MATMUL_256(0, 1); - MATMUL_256(0, 2); - MATMUL_256(0, 3); - } - STORE_256(0, 0); - STORE_256(0, 1); - STORE_256(0, 2); - STORE_256(0, 3); - } - - for (; j < n4; j+= 4) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - DECLARE_RESULT_128(0, 2); - DECLARE_RESULT_128(0, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_128(x, 0); - BROADCAST_LOAD_A_128(x, 1); - BROADCAST_LOAD_A_128(x, 2); - BROADCAST_LOAD_A_128(x, 3); - - LOAD_B_128(0, x); - - MATMUL_128(0, 0); - MATMUL_128(0, 1); - MATMUL_128(0, 2); - MATMUL_128(0, 3); - } - STORE_128(0, 0); - STORE_128(0, 1); - STORE_128(0, 2); - STORE_128(0, 3); - } - - for (; j < n2; j+= 2) { - DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); - DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); - DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2); - DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(x, 0); - BROADCAST_LOAD_A_SCALAR(x, 1); - BROADCAST_LOAD_A_SCALAR(x, 2); - BROADCAST_LOAD_A_SCALAR(x, 3); - - LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); - - MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); - 
MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); - MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2); - MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3); - } - STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); - STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); - STORE_SCALAR(0, 2); STORE_SCALAR(1, 2); - STORE_SCALAR(0, 3); STORE_SCALAR(1, 3); - } - - for (; j < N; j++) { - DECLARE_RESULT_SCALAR(0, 0) - DECLARE_RESULT_SCALAR(0, 1) - DECLARE_RESULT_SCALAR(0, 2) - DECLARE_RESULT_SCALAR(0, 3) - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(0, 0); - BROADCAST_LOAD_A_SCALAR(0, 1); - BROADCAST_LOAD_A_SCALAR(0, 2); - BROADCAST_LOAD_A_SCALAR(0, 3); - - LOAD_B_SCALAR(0, 0); - - MATMUL_SCALAR(0, 0); - MATMUL_SCALAR(0, 1); - MATMUL_SCALAR(0, 2); - MATMUL_SCALAR(0, 3); - } - STORE_SCALAR(0, 0); - STORE_SCALAR(0, 1); - STORE_SCALAR(0, 2); - STORE_SCALAR(0, 3); - } - } - - for (; i < m2; i+=2) { - j = 0; - - for (; j < n64; j+= 64) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); - - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - - LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); - - MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); - } - STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); - STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); - } - - for (; j < n32; j+= 32) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - - LOAD_B_512(0, x); LOAD_B_512(1, x); - - MATMUL_512(0, 0); MATMUL_512(1, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); - } - STORE_512(0, 0); STORE_512(1, 0); - STORE_512(0, 1); STORE_512(1, 1); - } - - - for (; j < n16; j+= 16) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - - LOAD_B_512(0, x); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - } - STORE_512(0, 0); - STORE_512(0, 1); - } - - for (; j < n8; j+= 8) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_256(x, 0); - BROADCAST_LOAD_A_256(x, 1); - - LOAD_B_256(0, x); - - MATMUL_256(0, 0); - MATMUL_256(0, 1); - } - STORE_256(0, 0); - STORE_256(0, 1); - } - - for (; j < n4; j+= 4) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_128(x, 0); - BROADCAST_LOAD_A_128(x, 1); - - LOAD_B_128(0, x); - - MATMUL_128(0, 0); - MATMUL_128(0, 1); - } - STORE_128(0, 0); - STORE_128(0, 1); - } - for (; j < n2; j+= 2) { - DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); - DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(x, 0); - BROADCAST_LOAD_A_SCALAR(x, 1); - - LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); - - MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); - MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); - } - STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); - STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); - } - - for (; j < N; j++) { - DECLARE_RESULT_SCALAR(0, 0); - DECLARE_RESULT_SCALAR(0, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(0, 0); - BROADCAST_LOAD_A_SCALAR(0, 1); - - LOAD_B_SCALAR(0, 0); - - 
MATMUL_SCALAR(0, 0); - MATMUL_SCALAR(0, 1); - } - STORE_SCALAR(0, 0); - STORE_SCALAR(0, 1); - } - } - - for (; i < M; i+=1) { - j = 0; - for (; j < n64; j+= 64) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); - MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); - } - STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); - } - for (; j < n32; j+= 32) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - LOAD_B_512(0, x); LOAD_B_512(1, x); - MATMUL_512(0, 0); MATMUL_512(1, 0); - } - STORE_512(0, 0); STORE_512(1, 0); - } - - - for (; j < n16; j+= 16) { - DECLARE_RESULT_512(0, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - - LOAD_B_512(0, x); - - MATMUL_512(0, 0); - } - STORE_512(0, 0); - } - - for (; j < n8; j+= 8) { - DECLARE_RESULT_256(0, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_256(x, 0); - LOAD_B_256(0, x); - MATMUL_256(0, 0); - } - STORE_256(0, 0); - } - - for (; j < n4; j+= 4) { - DECLARE_RESULT_128(0, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_128(x, 0); - LOAD_B_128(0, x); - MATMUL_128(0, 0); - } - STORE_128(0, 0); - } - - for (; j < n2; j+= 2) { - DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(x, 0); - LOAD_B_SCALAR(0, 0); LOAD_B_SCALAR(1, 0); - MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); - } - STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); - } - - for (; j < N; j++) { - DECLARE_RESULT_SCALAR(0, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(0, 0); - LOAD_B_SCALAR(0, 0); - MATMUL_SCALAR(0, 0); - } - STORE_SCALAR(0, 0); - } - } -} +#include "sgemm_direct_skylakex.c" diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index ee3417505..e4ca6b1bd 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -1,5 +1,5 @@ /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ -/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ +/* r10 to assist prefetch, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ #include "common.h" #include @@ -53,26 +53,25 @@ #define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;" #define COMPUTE_m16(ndim) \ INIT_m16n##ndim\ - "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\ - "cmpq $18,%4; jb "#ndim"016162f;"\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5; xorq %%r10,%%r10;"\ + "cmpq $16,%4; jb "#ndim"016162f;"\ #ndim"016161:\n\t"\ + "cmpq $126,%%r10; movq $126,%%r10; cmoveq %3,%%r10;"\ KERNEL_k1m16n##ndim\ KERNEL_k1m16n##ndim\ + "prefetcht1 (%5); subq $63,%5; addq %%r10,%5;"\ KERNEL_k1m16n##ndim\ - "prefetcht1 (%5); prefetcht1 63(%5); addq %3,%5;"\ KERNEL_k1m16n##ndim\ - KERNEL_k1m16n##ndim\ - KERNEL_k1m16n##ndim\ - "prefetcht1 (%8); addq $32,%8;"\ - "subq $6,%4; cmpq $18,%4; jnb "#ndim"016161b;"\ + "prefetcht1 (%6); addq $32,%6;"\ + "subq $4,%4; cmpq $16,%4; jnb "#ndim"016161b;"\ "movq %2,%5;"\ #ndim"016162:\n\t"\ - "testq %4,%4; jz "#ndim"016163f;"\ + "testq %4,%4; jz "#ndim"016164f;"\ + #ndim"016163:\n\t"\ "prefetcht0 (%5); prefetcht0 63(%5); 
prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1);"\ KERNEL_k1m16n##ndim\ - "leaq (%5,%3,2),%5;"\ - "decq %4; jmp "#ndim"016162b;"\ - #ndim"016163:\n\t"\ + "leaq (%5,%3,2),%5; decq %4; jnz "#ndim"016163b;"\ + #ndim"016164:\n\t"\ "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ SAVE_m16(ndim) @@ -212,185 +211,152 @@ #define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955) #define COMPUTE_m4(ndim) COMPUTE_m4_n##ndim -/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ #define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" -#define KERNEL_k1m2n1(b_addr) \ +#define KERNEL_k1m2n1 \ "vmovsd (%0),%%xmm1; addq $8,%0;"\ - "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ - "addq $4,"#b_addr";" -#define SAVE_L_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" #define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" -#define KERNEL_k1m2n2(b_addr) \ +#define KERNEL_k1m2n2 \ "vmovsd (%0),%%xmm1; addq $8,%0;"\ - "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ - "vbroadcastss 4("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ - "addq $8,"#b_addr";" -#define SAVE_L_m2n2 SAVE_L_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" #define INIT_m2n4 INIT_m2n2 #define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" #define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" -#define KERNEL_k1m2n4(b_addr) \ - "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ - "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "addq $8,%0;" -#define KERNEL_k1m2n8(b_addr) \ - "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ - "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ - "addq $8,%0;" -#define KERNEL_k1m2n12(b_addr) \ - "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\ - "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ - "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ - "addq $8,%0;" +#define INIT_m2n16 INIT_m2n12 "vpxor %%xmm10,%%xmm10,%%xmm10; vpxor %%xmm11,%%xmm11,%%xmm11;" +#define INIT_m2n20 INIT_m2n16 "vpxor %%xmm12,%%xmm12,%%xmm12; vpxor %%xmm13,%%xmm13,%%xmm13;" +#define INIT_m2n24 INIT_m2n20 "vpxor %%xmm14,%%xmm14,%%xmm14; vpxor %%xmm15,%%xmm15,%%xmm15;" +#define KERNEL_h_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;"\ + "vmovups (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m2n4 
KERNEL_h_k1m2n4 "addq $16,%1;" +#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 "vmovups (%1,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%1;" +#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 \ + "vmovups (%1,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm8; vfmadd231ps %%xmm2,%%xmm3,%%xmm9; addq $16,%1;" +#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 "vmovups (%%r15),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm10; vfmadd231ps %%xmm2,%%xmm3,%%xmm11;" +#define KERNEL_k1m2n16 KERNEL_h_k1m2n16 "addq $16,%%r15;" +#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 "vmovups (%%r15,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm12; vfmadd231ps %%xmm2,%%xmm3,%%xmm13;" +#define KERNEL_k1m2n20 KERNEL_h_k1m2n20 "addq $16,%%r15;" +#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 "vmovups (%%r15,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm14; vfmadd231ps %%xmm2,%%xmm3,%%xmm15;" +#define KERNEL_k1m2n24 KERNEL_h_k1m2n24 "addq $16,%%r15;" #define unit_save_m2n4(c1,c2) \ "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\ "leaq (%5,%3,2),%5;"\ "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\ "leaq (%5,%3,2),%5;" -#define SAVE_L_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) -#define SAVE_L_m2n8 SAVE_L_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) -#define SAVE_L_m2n12 SAVE_L_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) -#define SAVE_R_m2n4 unit_save_m2n4(%%xmm4,%%xmm5) -#define SAVE_R_m2n8 SAVE_R_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) -#define SAVE_R_m2n12 SAVE_R_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) -#define COMPUTE_L_m2(ndim,sim) \ - INIT_m2n##ndim\ - "movq %%r13,%4; movq %%r14,%1;"\ - #ndim""#sim"222:\n\t"\ - "testq %4,%4; jz "#ndim""#sim"223f;"\ - KERNEL_k1m2n##ndim(%1)\ - "decq %4; jmp "#ndim""#sim"222b;"\ - #ndim""#sim"223:\n\t"\ - SAVE_L_m2n##ndim "addq $8,%2;" -#define COMPUTE_R_m2(ndim,sim) \ - "salq $3,%%r13;subq %%r13,%0;sarq $3,%%r13;"\ +#define SAVE_h_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_h_m2n8 SAVE_h_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_h_m2n12 SAVE_h_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) +#define SAVE_h_m2n16 SAVE_h_m2n12 unit_save_m2n4(%%xmm10,%%xmm11) +#define SAVE_h_m2n20 SAVE_h_m2n16 unit_save_m2n4(%%xmm12,%%xmm13) +#define SAVE_h_m2n24 SAVE_h_m2n20 unit_save_m2n4(%%xmm14,%%xmm15) +#define SAVE_m2(ndim) SAVE_h_m2n##ndim "addq $8,%2;" +#define COMPUTE_m2(ndim) \ INIT_m2n##ndim\ - "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ - #ndim""#sim"222:\n\t"\ - "testq %4,%4; jz "#ndim""#sim"223f;"\ - KERNEL_k1m2n##ndim(%%r15)\ - "decq %4; jmp "#ndim""#sim"222b;"\ - #ndim""#sim"223:\n\t"\ - SAVE_R_m2n##ndim -#define COMPUTE_m2_n1 COMPUTE_L_m2(1,77877) -#define COMPUTE_m2_n2 COMPUTE_L_m2(2,77877) -#define COMPUTE_m2_n4 COMPUTE_L_m2(4,77877) -#define COMPUTE_m2_n8 COMPUTE_L_m2(8,77877) -#define COMPUTE_m2_n12 COMPUTE_L_m2(12,77877) -#define COMPUTE_m2_n16 COMPUTE_L_m2(12,77777) COMPUTE_R_m2(4,77977) -#define COMPUTE_m2_n20 COMPUTE_L_m2(12,77677) COMPUTE_R_m2(8,77977) -#define COMPUTE_m2_n24 COMPUTE_L_m2(12,77577) COMPUTE_R_m2(12,77977) -#define COMPUTE_m2(ndim) COMPUTE_m2_n##ndim - -/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + "testq %4,%4; jz "#ndim"002022f;"\ + 
#ndim"002021:\n\t"\ + KERNEL_k1m2n##ndim "decq %4; jnz "#ndim"002021b;"\ + #ndim"002022:\n\t"\ + SAVE_m2(ndim) + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ #define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" -#define KERNEL_k1m1n1(b_addr) \ - "vmovss ("#b_addr"),%%xmm3; addq $4,"#b_addr";"\ +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ "addq $4,%0;" -#define SAVE_L_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" #define INIT_m1n2 INIT_m1n1 -#define KERNEL_k1m1n2(b_addr) \ - "vmovsd ("#b_addr"),%%xmm3; addq $8,"#b_addr";"\ +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ "addq $4,%0;" -#define SAVE_L_m1n2 \ +#define SAVE_h_m1n2 \ "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" #define INIT_m1n4 INIT_m1n2 #define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" #define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" -#define KERNEL_k1m1n4(b_addr) \ - "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ - "addq $4,%0;" -#define KERNEL_k1m1n8(b_addr) \ - "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ - "addq $4,%0;" -#define KERNEL_k1m1n12(b_addr) \ - "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\ - "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ - "addq $4,%0;" +#define INIT_m1n16 INIT_m1n12 "vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m1n20 INIT_m1n16 "vpxor %%xmm8,%%xmm8,%%xmm8;" +#define INIT_m1n24 INIT_m1n20 "vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_h_k1m1n4 \ + "vbroadcastss (%0),%%xmm1; addq $4,%0; vfmadd231ps (%1),%%xmm1,%%xmm4;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" +#define KERNEL_h_k1m1n8 KERNEL_h_k1m1n4 "vfmadd231ps (%1,%%r12,1),%%xmm1,%%xmm5;" +#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%1;" +#define KERNEL_k1m1n12 KERNEL_h_k1m1n8 "vfmadd231ps (%1,%%r12,2),%%xmm1,%%xmm6; addq $16,%1;" +#define KERNEL_h_k1m1n16 KERNEL_k1m1n12 "vfmadd231ps (%%r15),%%xmm1,%%xmm7;" +#define KERNEL_k1m1n16 KERNEL_h_k1m1n16 "addq $16,%%r15;" +#define KERNEL_h_k1m1n20 KERNEL_h_k1m1n16 "vfmadd231ps (%%r15,%%r12,1),%%xmm1,%%xmm8;" +#define KERNEL_k1m1n20 KERNEL_h_k1m1n20 "addq $16,%%r15;" +#define KERNEL_h_k1m1n24 KERNEL_h_k1m1n20 "vfmadd231ps (%%r15,%%r12,2),%%xmm1,%%xmm9;" +#define KERNEL_k1m1n24 KERNEL_h_k1m1n24 "addq $16,%%r15;" #define unit_save_m1n4(c1) \ "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_L_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) -#define SAVE_L_m1n8 SAVE_L_m1n4 unit_save_m1n4(%%xmm5) -#define SAVE_L_m1n12 SAVE_L_m1n8 
unit_save_m1n4(%%xmm6) -#define SAVE_R_m1n4 unit_save_m1n4(%%xmm4) -#define SAVE_R_m1n8 SAVE_R_m1n4 unit_save_m1n4(%%xmm5) -#define SAVE_R_m1n12 SAVE_R_m1n8 unit_save_m1n4(%%xmm6) -#define COMPUTE_L_m1(ndim,sim) \ - INIT_m1n##ndim\ - "movq %%r13,%4; movq %%r14,%1;"\ - #ndim""#sim"112:\n\t"\ - "testq %4,%4; jz "#ndim""#sim"113f;"\ - KERNEL_k1m1n##ndim(%1)\ - "decq %4; jmp "#ndim""#sim"112b;"\ - #ndim""#sim"113:\n\t"\ - SAVE_L_m1n##ndim "addq $4,%2;" -#define COMPUTE_R_m1(ndim,sim) \ - "salq $2,%%r13;subq %%r13,%0;sarq $2,%%r13;"\ +#define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_h_m1n8 SAVE_h_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_h_m1n12 SAVE_h_m1n8 unit_save_m1n4(%%xmm6) +#define SAVE_h_m1n16 SAVE_h_m1n12 unit_save_m1n4(%%xmm7) +#define SAVE_h_m1n20 SAVE_h_m1n16 unit_save_m1n4(%%xmm8) +#define SAVE_h_m1n24 SAVE_h_m1n20 unit_save_m1n4(%%xmm9) +#define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $4,%2;" +#define COMPUTE_m1(ndim) \ INIT_m1n##ndim\ - "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ - #ndim""#sim"112:\n\t"\ - "testq %4,%4; jz "#ndim""#sim"113f;"\ - KERNEL_k1m1n##ndim(%%r15)\ - "decq %4; jmp "#ndim""#sim"112b;"\ - #ndim""#sim"113:\n\t"\ - SAVE_R_m1n##ndim -#define COMPUTE_m1_n1 COMPUTE_L_m1(1,99899) -#define COMPUTE_m1_n2 COMPUTE_L_m1(2,99899) -#define COMPUTE_m1_n4 COMPUTE_L_m1(4,99899) -#define COMPUTE_m1_n8 COMPUTE_L_m1(8,99899) -#define COMPUTE_m1_n12 COMPUTE_L_m1(12,99899) -#define COMPUTE_m1_n16 COMPUTE_L_m1(12,99799) COMPUTE_R_m1(4,99999) -#define COMPUTE_m1_n20 COMPUTE_L_m1(12,99699) COMPUTE_R_m1(8,99999) -#define COMPUTE_m1_n24 COMPUTE_L_m1(12,99599) COMPUTE_R_m1(12,99999) -#define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + "testq %4,%4; jz "#ndim"001012f;"\ + #ndim"001011:\n\t"\ + KERNEL_k1m1n##ndim "decq %4; jnz "#ndim"001011b;"\ + #ndim"001012:\n\t"\ + SAVE_m1(ndim) /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ -/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */ -/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ +/* %6 = "+r"(next_b), %7 = "m"(ALPHA), %8 = "m"(M) */ +/* r11 = m_counter, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ #define COMPUTE(ndim) {\ next_b = b_pointer + ndim * K;\ __asm__ __volatile__(\ - "vbroadcastss (%6),%%zmm0;"\ - "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\ - "cmpq $16,%7;jb 33101"#ndim"f;"\ + "vbroadcastss %7,%%zmm0;"\ + "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\ + "cmpq $16,%%r11;jb 33101"#ndim"f;"\ "33109"#ndim":\n\t"\ COMPUTE_m16(ndim)\ - "subq $16,%7;cmpq $16,%7;jnb 33109"#ndim"b;"\ + "subq $16,%%r11;cmpq $16,%%r11;jnb 33109"#ndim"b;"\ "33101"#ndim":\n\t"\ - "cmpq $8,%7;jb 33102"#ndim"f;"\ + "cmpq $8,%%r11;jb 33102"#ndim"f;"\ COMPUTE_m8(ndim)\ - "subq $8,%7;"\ + "subq $8,%%r11;"\ "33102"#ndim":\n\t"\ - "cmpq $4,%7;jb 33103"#ndim"f;"\ + "cmpq $4,%%r11;jb 33103"#ndim"f;"\ COMPUTE_m4(ndim)\ - "subq $4,%7;"\ + "subq $4,%%r11;"\ "33103"#ndim":\n\t"\ - "cmpq $2,%7;jb 33104"#ndim"f;"\ + "cmpq $2,%%r11;jb 33104"#ndim"f;"\ COMPUTE_m2(ndim)\ - "subq $2,%7;"\ + "subq $2,%%r11;"\ "33104"#ndim":\n\t"\ - "testq %7,%7;jz 33105"#ndim"f;"\ + "testq %%r11,%%r11;jz 33105"#ndim"f;"\ COMPUTE_m1(ndim)\ "33105"#ndim":\n\t"\ - "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ - 
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M),"+r"(next_b)\ - ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ + "movq %%r13,%4; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(next_b):"m"(ALPHA),"m"(M)\ + :"r10","r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ "cc","memory");\ - a_pointer -= M * K; b_pointer += ndim * K;c_pointer += LDC * ndim - M;\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M;\ } int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) @@ -399,7 +365,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha; int64_t M = (int64_t)m, K = (int64_t)k; BLASLONG n_count = n; - float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA,*next_b = B; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; for(;n_count>23;n_count-=24) COMPUTE(24) for(;n_count>19;n_count-=20) COMPUTE(20) for(;n_count>15;n_count-=16) COMPUTE(16) @@ -411,469 +377,4 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; } -#include -/* codes below are copied from the sgemm kernel written by Arjan van der Ven */ - -/* - * "Direct sgemm" code. This code operates directly on the inputs and outputs - * of the sgemm call, avoiding the copies, memory realignments and threading, - * and only supports alpha = 1 and beta = 0. - * This is a common case and provides value for relatively small matrixes. - * For larger matrixes the "regular" sgemm code is superior, there the cost of - * copying/shuffling the B matrix really pays off. 
- */ - - - -#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps() -#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) -#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)]) -#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M) -#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M) - - -#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps() -#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) -#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)]) -#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M) -#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M) - -#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps() -#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)])) -#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)]) -#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M) -#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M) - -#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0; -#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)]; -#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + N]; -#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; -#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; - -int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) -{ - unsigned long long mnk = M * N * K; - /* large matrixes -> not performant */ - if (mnk >= 28 * 512 * 512) - return 0; - - /* - * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, - * and the regular sgemm copy/realignment of data pays off much quicker - */ - if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) - return 0; - -#ifdef SMP - /* if we can run multithreaded, the threading changes the based threshold */ - if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) - return 0; -#endif - - return 1; -} - - - -void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) -{ - int i, j, k; - - int m4 = M & ~3; - int m2 = M & ~1; - - int n64 = N & ~63; - int n32 = N & ~31; - int n16 = N & ~15; - int n8 = N & ~7; - int n4 = N & ~3; - int n2 = N & ~1; - - i = 0; - - for (i = 0; i < m4; i+=4) { - - for (j = 0; j < n64; j+= 64) { - k = 0; - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); - DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); - DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); - - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - BROADCAST_LOAD_A_512(x, 2); - BROADCAST_LOAD_A_512(x, 3); - - LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); - - MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); - 
MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); - MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); - } - STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); - STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); - STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); - STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); - } - - for (; j < n32; j+= 32) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); - DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); - DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - BROADCAST_LOAD_A_512(x, 2); - BROADCAST_LOAD_A_512(x, 3); - - LOAD_B_512(0, x); LOAD_B_512(1, x); - - MATMUL_512(0, 0); MATMUL_512(1, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); - MATMUL_512(0, 2); MATMUL_512(1, 2); - MATMUL_512(0, 3); MATMUL_512(1, 3); - } - STORE_512(0, 0); STORE_512(1, 0); - STORE_512(0, 1); STORE_512(1, 1); - STORE_512(0, 2); STORE_512(1, 2); - STORE_512(0, 3); STORE_512(1, 3); - } - - for (; j < n16; j+= 16) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - DECLARE_RESULT_512(0, 2); - DECLARE_RESULT_512(0, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - BROADCAST_LOAD_A_512(x, 2); - BROADCAST_LOAD_A_512(x, 3); - - LOAD_B_512(0, x); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - MATMUL_512(0, 2); - MATMUL_512(0, 3); - } - STORE_512(0, 0); - STORE_512(0, 1); - STORE_512(0, 2); - STORE_512(0, 3); - } - - for (; j < n8; j+= 8) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - DECLARE_RESULT_256(0, 2); - DECLARE_RESULT_256(0, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_256(x, 0); - BROADCAST_LOAD_A_256(x, 1); - BROADCAST_LOAD_A_256(x, 2); - BROADCAST_LOAD_A_256(x, 3); - - LOAD_B_256(0, x); - - MATMUL_256(0, 0); - MATMUL_256(0, 1); - MATMUL_256(0, 2); - MATMUL_256(0, 3); - } - STORE_256(0, 0); - STORE_256(0, 1); - STORE_256(0, 2); - STORE_256(0, 3); - } - - for (; j < n4; j+= 4) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - DECLARE_RESULT_128(0, 2); - DECLARE_RESULT_128(0, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_128(x, 0); - BROADCAST_LOAD_A_128(x, 1); - BROADCAST_LOAD_A_128(x, 2); - BROADCAST_LOAD_A_128(x, 3); - - LOAD_B_128(0, x); - - MATMUL_128(0, 0); - MATMUL_128(0, 1); - MATMUL_128(0, 2); - MATMUL_128(0, 3); - } - STORE_128(0, 0); - STORE_128(0, 1); - STORE_128(0, 2); - STORE_128(0, 3); - } - - for (; j < n2; j+= 2) { - DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); - DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); - DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2); - DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(x, 0); - BROADCAST_LOAD_A_SCALAR(x, 1); - BROADCAST_LOAD_A_SCALAR(x, 2); - BROADCAST_LOAD_A_SCALAR(x, 3); - - LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); - - MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); - MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); - MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2); - MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3); - } - STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); - STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); - STORE_SCALAR(0, 2); STORE_SCALAR(1, 2); - STORE_SCALAR(0, 3); STORE_SCALAR(1, 3); - } - - for (; j < N; j++) { - DECLARE_RESULT_SCALAR(0, 0) - DECLARE_RESULT_SCALAR(0, 1) - DECLARE_RESULT_SCALAR(0, 
2) - DECLARE_RESULT_SCALAR(0, 3) - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(0, 0); - BROADCAST_LOAD_A_SCALAR(0, 1); - BROADCAST_LOAD_A_SCALAR(0, 2); - BROADCAST_LOAD_A_SCALAR(0, 3); - - LOAD_B_SCALAR(0, 0); - - MATMUL_SCALAR(0, 0); - MATMUL_SCALAR(0, 1); - MATMUL_SCALAR(0, 2); - MATMUL_SCALAR(0, 3); - } - STORE_SCALAR(0, 0); - STORE_SCALAR(0, 1); - STORE_SCALAR(0, 2); - STORE_SCALAR(0, 3); - } - } - - for (; i < m2; i+=2) { - j = 0; - - for (; j < n64; j+= 64) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); - - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - - LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); - - MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); - } - STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); - STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); - } - - for (; j < n32; j+= 32) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - - LOAD_B_512(0, x); LOAD_B_512(1, x); - - MATMUL_512(0, 0); MATMUL_512(1, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); - } - STORE_512(0, 0); STORE_512(1, 0); - STORE_512(0, 1); STORE_512(1, 1); - } - - - for (; j < n16; j+= 16) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - BROADCAST_LOAD_A_512(x, 1); - - LOAD_B_512(0, x); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - } - STORE_512(0, 0); - STORE_512(0, 1); - } - - for (; j < n8; j+= 8) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_256(x, 0); - BROADCAST_LOAD_A_256(x, 1); - - LOAD_B_256(0, x); - - MATMUL_256(0, 0); - MATMUL_256(0, 1); - } - STORE_256(0, 0); - STORE_256(0, 1); - } - - for (; j < n4; j+= 4) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_128(x, 0); - BROADCAST_LOAD_A_128(x, 1); - - LOAD_B_128(0, x); - - MATMUL_128(0, 0); - MATMUL_128(0, 1); - } - STORE_128(0, 0); - STORE_128(0, 1); - } - for (; j < n2; j+= 2) { - DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); - DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(x, 0); - BROADCAST_LOAD_A_SCALAR(x, 1); - - LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x); - - MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); - MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1); - } - STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); - STORE_SCALAR(0, 1); STORE_SCALAR(1, 1); - } - - for (; j < N; j++) { - DECLARE_RESULT_SCALAR(0, 0); - DECLARE_RESULT_SCALAR(0, 1); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(0, 0); - BROADCAST_LOAD_A_SCALAR(0, 1); - - LOAD_B_SCALAR(0, 0); - - MATMUL_SCALAR(0, 0); - MATMUL_SCALAR(0, 1); - } - STORE_SCALAR(0, 0); - STORE_SCALAR(0, 1); - } - } - - for (; i < M; i+=1) { - j = 0; - for (; j < n64; j+= 64) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x); - MATMUL_512(0, 0); 
MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); - } - STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); - } - for (; j < n32; j+= 32) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - LOAD_B_512(0, x); LOAD_B_512(1, x); - MATMUL_512(0, 0); MATMUL_512(1, 0); - } - STORE_512(0, 0); STORE_512(1, 0); - } - - - for (; j < n16; j+= 16) { - DECLARE_RESULT_512(0, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_512(x, 0); - - LOAD_B_512(0, x); - - MATMUL_512(0, 0); - } - STORE_512(0, 0); - } - - for (; j < n8; j+= 8) { - DECLARE_RESULT_256(0, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_256(x, 0); - LOAD_B_256(0, x); - MATMUL_256(0, 0); - } - STORE_256(0, 0); - } - - for (; j < n4; j+= 4) { - DECLARE_RESULT_128(0, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_128(x, 0); - LOAD_B_128(0, x); - MATMUL_128(0, 0); - } - STORE_128(0, 0); - } - - for (; j < n2; j+= 2) { - DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(x, 0); - LOAD_B_SCALAR(0, 0); LOAD_B_SCALAR(1, 0); - MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0); - } - STORE_SCALAR(0, 0); STORE_SCALAR(1, 0); - } - - for (; j < N; j++) { - DECLARE_RESULT_SCALAR(0, 0); - - for (k = 0; k < K; k++) { - BROADCAST_LOAD_A_SCALAR(0, 0); - LOAD_B_SCALAR(0, 0); - MATMUL_SCALAR(0, 0); - } - STORE_SCALAR(0, 0); - } - } -} +#include "sgemm_direct_skylakex.c" From 952cc2ba3860419defed3c27af1c3becca9e40e9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 13 Jan 2020 16:58:54 +0800 Subject: [PATCH 852/935] Update sgemm_kernel_16x4_skylakex_2.c --- kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index e4ca6b1bd..6ca822b91 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -376,5 +376,5 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f if(n_count>0) COMPUTE(1) return 0; } - +#include #include "sgemm_direct_skylakex.c" From e5dcdeb5506a8e0ab26e0956c5b8e7fed7e80e9a Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 13 Jan 2020 16:59:23 +0800 Subject: [PATCH 853/935] Update sgemm_direct_skylakex.c --- kernel/x86_64/sgemm_direct_skylakex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index 4f9af6e57..0e8f1318f 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,6 +1,6 @@ /* the direct sgemm code written by Arjan van der Ven */ -#include +//#include /* * "Direct sgemm" code. 
This code operates directly on the inputs and outputs From 78100b80935753a7a86c6a5380e2a53bc9469b7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jan 2020 15:06:39 +0100 Subject: [PATCH 854/935] Free Windows thread memory with MEM_RELEASE rather than MEM_DECOMMIT as suggested by hjmndv in #2370 --- driver/others/memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 55dce72b8..62a5a0214 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -822,7 +822,7 @@ static void *alloc_qalloc(void *address){ static void alloc_windows_free(struct alloc_t *alloc_info){ - VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT); + VirtualFree(alloc_info, 0, MEM_RELEASE); } @@ -935,7 +935,7 @@ static void alloc_hugetlb_free(struct alloc_t *alloc_info){ #ifdef OS_WINDOWS - VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE); #endif @@ -2310,7 +2310,7 @@ static void *alloc_qalloc(void *address){ static void alloc_windows_free(struct release_t *release){ - VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + VirtualFree(release -> address, 0, MEM_RELEASE); } @@ -2432,7 +2432,7 @@ static void alloc_hugetlb_free(struct release_t *release){ #ifdef OS_WINDOWS - VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(release -> address, 0, MEM_LARGE_PAGES | MEM_RELEASE); #endif From 23f322f997c8b018977be24122c56fb62d728a05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Jan 2020 13:28:27 +0100 Subject: [PATCH 855/935] Do not run any cleanup if the program is exiting anyway From keno's PR #2350 - this avoids the potential hang in blas_thread_shutdown where we may wait for threads to exit while they are waiting on the loader lock from DllMain --- exports/dllinit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/exports/dllinit.c b/exports/dllinit.c index 4a05c0e14..88f9af658 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -50,7 +50,10 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { gotoblas_init(); break; case DLL_PROCESS_DETACH: - gotoblas_quit(); + // If the process is about to exit, don't bother releasing any resources + // The kernel is much better at bulk releasing then. 
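+      // (Per the Win32 DllMain contract, "reserved" is NULL here when
+      // the DLL is being unloaded by FreeLibrary and non-NULL when the
+      // process itself is terminating, so the check below skips the
+      // cleanup only on actual process exit.)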
+ if (!reserved) + gotoblas_quit(); break; case DLL_THREAD_ATTACH: break; From ff42e68652fbba58936c9c66d0b060c3a6d694e7 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 20 Jan 2020 11:49:42 +0800 Subject: [PATCH 856/935] Optimize genenal Gemm Beta --- kernel/generic/gemm_beta.c | 132 ++++++++++++------------------------- 1 file changed, 42 insertions(+), 90 deletions(-) diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c index c4e4f7abe..fa9d7680d 100644 --- a/kernel/generic/gemm_beta.c +++ b/kernel/generic/gemm_beta.c @@ -42,101 +42,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ + BLASLONG i, j; + BLASLONG chunk, remain; FLOAT *c_offset1, *c_offset; - FLOAT ctemp1, ctemp2, ctemp3, ctemp4; - FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - c_offset = c; - + chunk = m >> 3; + remain = m & 7; if (beta == ZERO){ - - j = n; - do { - c_offset1 = c_offset; - c_offset += ldc; - - i = (m >> 3); - if (i > 0){ - do { - *(c_offset1 + 0) = ZERO; - *(c_offset1 + 1) = ZERO; - *(c_offset1 + 2) = ZERO; - *(c_offset1 + 3) = ZERO; - *(c_offset1 + 4) = ZERO; - *(c_offset1 + 5) = ZERO; - *(c_offset1 + 6) = ZERO; - *(c_offset1 + 7) = ZERO; - c_offset1 += 8; - i --; - } while (i > 0); - } - - i = (m & 7); - if (i > 0){ - do { - *c_offset1 = ZERO; - c_offset1 ++; - i --; - } while (i > 0); - } - j --; - } while (j > 0); - + for(j=n; j>0; j--){ + c_offset1 = c_offset; + c_offset += ldc; + for(i=chunk; i>0; i--){ + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + *(c_offset1 + 2) = ZERO; + *(c_offset1 + 3) = ZERO; + *(c_offset1 + 4) = ZERO; + *(c_offset1 + 5) = ZERO; + *(c_offset1 + 6) = ZERO; + *(c_offset1 + 7) = ZERO; + c_offset1 += 8; + } + for(i=remain; i>0; i--){ + *c_offset1 = ZERO; + c_offset1 ++; + } + } } else { - - j = n; - do { - c_offset1 = c_offset; - c_offset += ldc; - - i = (m >> 3); - if (i > 0){ - do { - ctemp1 = *(c_offset1 + 0); - ctemp2 = *(c_offset1 + 1); - ctemp3 = *(c_offset1 + 2); - ctemp4 = *(c_offset1 + 3); - ctemp5 = *(c_offset1 + 4); - ctemp6 = *(c_offset1 + 5); - ctemp7 = *(c_offset1 + 6); - ctemp8 = *(c_offset1 + 7); - - ctemp1 *= beta; - ctemp2 *= beta; - ctemp3 *= beta; - ctemp4 *= beta; - ctemp5 *= beta; - ctemp6 *= beta; - ctemp7 *= beta; - ctemp8 *= beta; - - *(c_offset1 + 0) = ctemp1; - *(c_offset1 + 1) = ctemp2; - *(c_offset1 + 2) = ctemp3; - *(c_offset1 + 3) = ctemp4; - *(c_offset1 + 4) = ctemp5; - *(c_offset1 + 5) = ctemp6; - *(c_offset1 + 6) = ctemp7; - *(c_offset1 + 7) = ctemp8; - c_offset1 += 8; - i --; - } while (i > 0); - } - - i = (m & 7); - if (i > 0){ - do { - ctemp1 = *c_offset1; - ctemp1 *= beta; - *c_offset1 = ctemp1; - c_offset1 ++; - i --; - } while (i > 0); - } - j --; - } while (j > 0); - + for(j=n; j>0; j--){ + c_offset1 = c_offset; + c_offset += ldc; + for(i=chunk; i>0; i--){ + *(c_offset1 + 0) *= beta; + *(c_offset1 + 1) *= beta; + *(c_offset1 + 2) *= beta; + *(c_offset1 + 3) *= beta; + *(c_offset1 + 4) *= beta; + *(c_offset1 + 5) *= beta; + *(c_offset1 + 6) *= beta; + *(c_offset1 + 7) *= beta; + c_offset1 += 8; + } + for(i=remain; i>0; i--){ + *c_offset1 *= beta; + c_offset1 ++; + } + } } return 0; }; From fbf4f48f4a3d324dd268aaad51624022ee4f0ea2 Mon Sep 17 00:00:00 2001 From: "Wang,Long" Date: Wed, 22 Jan 2020 15:07:50 +0000 Subject: [PATCH 857/935] fix a few performance drop in some matrix size per data type Signed-off-by: Wang,Long --- param.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git 
a/param.h b/param.h index 3baae31cf..075c12ca2 100644 --- a/param.h +++ b/param.h @@ -1507,8 +1507,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 32 -#define GEMM_PREFERED_SIZE 16 +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 4 +#define GEMM_PREFERED_SIZE 4 +#else +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#endif #ifdef ARCH_X86 @@ -1627,8 +1632,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 32 -#define GEMM_PREFERED_SIZE 32 +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif #define USE_SGEMM_KERNEL_DIRECT 1 #ifdef ARCH_X86 From e9fb8f62b1822c456ccc0b9db23f49aa66dd6801 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 22 Jan 2020 17:40:03 +0000 Subject: [PATCH 858/935] Update level3_gemm3m_thread.c --- driver/level3/level3_gemm3m_thread.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 21d431b60..9216daaed 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -104,7 +104,7 @@ typedef struct { #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], BETA[1], NULL, 0, NULL, 0, \ - (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) #endif #ifndef ICOPYB_OPERATION @@ -414,7 +414,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); @@ -550,7 +550,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); @@ -687,7 +687,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); From 8dc9fd4dfeb894d8b7553c8e5fcc991917335557 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Jan 2020 12:41:18 +0100 Subject: [PATCH 859/935] Add -march option for AVX512 --- cmake/cc.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 37da0d6ed..22217575c 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -96,3 +96,10 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN") endif () endif () +if (${CORE} STREQUAL "SKYLAKEX") + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + set (CCOMMON_OPT = "${CCOMMON_OPT} -march=skylake-avx512") + endif () + endif () +endif () From 8019e70211f5e6679e1e5afd5658016d8045de19 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 3 Feb 2020 21:32:56 +0800 Subject: [PATCH 860/935] AVX512 16x2 DGEMM kernel --- 
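[Note, kept below the "---" fold so it stays out of the commit message: the
new kernel is easier to review with its blocking spelled out. Each k
iteration loads 16 doubles of A into two zmm registers and broadcasts two
entries of B, i.e. four FMAs per 16x2 tile of C, with up to six such B
column-pairs (n=12) kept live across the packed-B pointers. The plain-C
sketch below models the arithmetic of a single 16x2 tile only; the function
name is illustrative, and the TRMM offset logic and edge tails are
deliberately ignored.]

/* Scalar model of one 16x2 tile: C = alpha*acc + C, where acc = A*B.
   A is a packed 16-by-K panel, B a packed K-by-2 panel. */
static void tile_16x2_model(long K, double alpha, const double *A,
                            const double *B, double *C, long ldc)
{
    double acc[16][2] = {{0.0}};
    for (long k = 0; k < K; k++)          /* one KERNEL_k1m16n2 step  */
        for (int j = 0; j < 2; j++)       /* vbroadcastsd of B[k][j]  */
            for (int i = 0; i < 16; i++)  /* two zmm loads of A[k][.] */
                acc[i][j] += A[k * 16 + i] * B[k * 2 + j];
    for (int j = 0; j < 2; j++)           /* SAVE_m16n2 (non-TRMM:    */
        for (int i = 0; i < 16; i++)      /* vfmadd213pd with C)      */
            C[i + j * ldc] = alpha * acc[i][j] + C[i + j * ldc];
}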
kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 488 +++++++++++++++++++++ 1 file changed, 488 insertions(+) create mode 100644 kernel/x86_64/dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c new file mode 100644 index 000000000..250ff8d49 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -0,0 +1,488 @@ +#if (defined (LEFT) && !defined(TRANSA)) || (!defined (LEFT) && defined(TRANSA)) + #define BACKWARDS 1 +#else + #define BACKWARDS 0 +#endif +#define GEMM_SET_PB "movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;" +#define set_p_copy1(ptr) "sarq $1,%%r12; addq %%r12,"#ptr"; salq $1,%%r12; salq $3,%%r13; subq %%r13,"#ptr"; sarq $3,%%r13;" +#define set_p_copy2(ptr) "addq %%r12,"#ptr"; salq $4,%%r13; subq %%r13,"#ptr"; sarq $4,%%r13;" +#define set_p_copy4(ptr) "leaq ("#ptr",%%r12,2),"#ptr"; salq $5,%%r13; subq %%r13,"#ptr"; sarq $5,%%r13;" +#define set_p_copy8(ptr) "leaq ("#ptr",%%r12,4),"#ptr"; salq $6,%%r13; subq %%r13,"#ptr"; sarq $6,%%r13;" +#define set_p_copy16(ptr) "leaq ("#ptr",%%r12,8),"#ptr"; salq $7,%%r13; subq %%r13,"#ptr"; sarq $7,%%r13;" +#define set_p_b_dim1(ptr) set_p_copy1(ptr) +#define set_p_b_dim2(ptr) set_p_copy2(ptr) +#define set_p_b_dim4(ptr) set_p_copy2(ptr) +#define set_p_b_dim6(ptr) set_p_copy2(ptr) +#define set_p_b_dim8(ptr) set_p_copy2(ptr) +#define set_p_b_dim10(ptr) set_p_copy2(ptr) +#define set_p_b_dim12(ptr) set_p_copy2(ptr) +#ifdef TRMMKERNEL + #if BACKWARDS == 1 + #define INIT_set_papb(mdim,ndim) GEMM_SET_PB set_p_copy##mdim(%0) set_p_b_dim##ndim(%1) set_p_b_dim##ndim(%%r15) + #define SAVE_set_pa(mdim) "" + #else + #define INIT_set_papb(mdim,ndim) GEMM_SET_PB + #define SAVE_set_pa(mdim) set_p_copy##mdim(%0) + #endif +#else + #define INIT_set_papb(mdim,ndim) GEMM_SET_PB + #define SAVE_set_pa(mdim) "" +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + #if BACKWARDS == 1 + #define HEAD_SET_OFF(ndim) {} + #define TAIL_SET_OFF(ndim) {off += ndim;} + #define kernel_kstart_n4(mdim,updk) KERNEL_k1m##mdim##n2 KERNEL_k1m##mdim##n2 "addq $32,%%r15; "#updk" $2,%5;" + #define kernel_kstart_n6(mdim,updk) kernel_kstart_n4(mdim,updk) KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "addq $32,%%r15; "#updk" $2,%5;" + #define kernel_kstart_n8(mdim,updk) kernel_kstart_n6(mdim,updk) KERNEL_k1m##mdim##n6 KERNEL_k1m##mdim##n6 "addq $32,%%r15; "#updk" $2,%5;" + #define kernel_kstart_n10(mdim,updk) kernel_kstart_n8(mdim,updk) KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 #updk" $2,%5;" + #define kernel_kstart_n12(mdim,updk) kernel_kstart_n10(mdim,updk) KERNEL_k1m##mdim##n10 KERNEL_k1m##mdim##n10 #updk" $2,%5;" + #define kernel_kend_n4(mdim) "" + #define kernel_kend_n6(mdim) "" + #define kernel_kend_n8(mdim) "" + #define kernel_kend_n10(mdim) "" + #define kernel_kend_n12(mdim) "" + #else + #define HEAD_SET_OFF(ndim) {off += (ndim > 2 ? 2 : ndim);} + #define TAIL_SET_OFF(ndim) {off += (ndim > 2 ? 
(ndim-2) : 0);} + #define kernel_kstart_n4(mdim,updk) "" + #define kernel_kstart_n6(mdim,updk) "" + #define kernel_kstart_n8(mdim,updk) "" + #define kernel_kstart_n10(mdim,updk) "" + #define kernel_kstart_n12(mdim,updk) "" + #define kernel_kend_n4(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) + #define kernel_kend_n6(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) + #define kernel_kend_n8(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) + #define kernel_kend_n10(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) + #define kernel_kend_n12(mdim) "xorq %3,%3;"\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152) + #endif +#else + #define HEAD_SET_OFF(ndim) {} + #define TAIL_SET_OFF(ndim) {} + #define kernel_kstart_n4(mdim,updk) "" + #define kernel_kstart_n6(mdim,updk) "" + #define kernel_kstart_n8(mdim,updk) "" + #define kernel_kstart_n10(mdim,updk) "" + #define kernel_kstart_n12(mdim,updk) "" + #define 
kernel_kend_n4(mdim) "" + #define kernel_kend_n6(mdim) "" + #define kernel_kend_n8(mdim) "" + #define kernel_kend_n10(mdim) "" + #define kernel_kend_n12(mdim) "" +#endif +#define kernel_kstart_n1(mdim,updk) "" +#define kernel_kstart_n2(mdim,updk) "" +#define kernel_kend_n1(mdim) "" +#define kernel_kend_n2(mdim) "" + +#ifdef TRMMKERNEL + #if BACKWARDS == 1 + #define INITASM_SET_K "movq %10,%%r13; subq %9,%%r13;" + #else + #define INITASM_SET_K "movq %9,%%r13;" + #endif +#else + #define INITASM_SET_K "movq %10,%%r13;" +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + #if BACKWARDS==1 + #define init_update_k(mdim) "" + #define save_update_k(mdim) "subq $"#mdim",%%r13;" + #else + #define init_update_k(mdim) "addq $"#mdim",%%r13;" + #define save_update_k(mdim) "" + #endif +#else + #define init_update_k(mdim) "" + #define save_update_k(mdim) "" +#endif + +#define KERNEL_h_k1m16n1 \ + "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\ + "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;" +#define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;" +#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ + "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;" +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" +#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\ + "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\ + "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" +#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__) +#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) +#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" +#define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m16n8 KERNEL_k1m16n6 "prefetcht0 448(%0);" unit_acc_m16n2(20,21,22,23,%%r15) +#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%%r15;" +#define KERNEL_h_k1m16n10 KERNEL_h_k1m16n8 unit_acc_m16n2(24,25,26,27,%%r15,%%r12,1) +#define KERNEL_k1m16n10 KERNEL_h_k1m16n10 "addq $16,%%r15;" +#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2) +#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;" + #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1) + #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2) + #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15) + #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1) + #define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2) +#endif +#define save_init_m16 "movq %2,%3; addq $128,%2;" +#ifdef TRMMKERNEL + #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ + "vmulpd 
%%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ + "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11) +#define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15) +#define SAVE_m16n6 SAVE_m16n4 unit_save_m16n2(16,17,18,19) +#define SAVE_m16n8 SAVE_m16n6 unit_save_m16n2(20,21,22,23) +#define SAVE_m16n10 SAVE_m16n8 unit_save_m16n2(24,25,26,27) +#define SAVE_m16n12 SAVE_m16n10 unit_save_m16n2(28,29,30,31) +#define unit_init_2zmm(c1_no,c2_no) "vpxorq %%zmm"#c1_no",%%zmm"#c1_no",%%zmm"#c1_no"; vpxorq %%zmm"#c2_no",%%zmm"#c2_no",%%zmm"#c2_no";" +#define unit_init_4zmm(c1_no,c2_no,c3_no,c4_no) unit_init_2zmm(c1_no,c2_no) unit_init_2zmm(c3_no,c4_no) +#define INIT_m16n1 unit_init_2zmm(8,9) +#define INIT_m16n2 unit_init_4zmm(8,9,10,11) +#define INIT_m16n4 INIT_m16n2 unit_init_4zmm(12,13,14,15) +#define INIT_m16n6 INIT_m16n4 unit_init_4zmm(16,17,18,19) +#define INIT_m16n8 INIT_m16n6 unit_init_4zmm(20,21,22,23) +#define INIT_m16n10 INIT_m16n8 unit_init_4zmm(24,25,26,27) +#define INIT_m16n12 INIT_m16n10 unit_init_4zmm(28,29,30,31) + +#define KERNEL_k1m8n1 \ + "vbroadcastsd (%1),%%zmm1; addq $8,%1;"\ + "vfmadd231pd (%0),%%zmm1,%%zmm8; addq $64,%0;" +#define unit_acc_gen_m8n2(c1_no,c2_no,boff,...)\ + "vbroadcastf32x4 "#boff"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" +#define unit_acc_m8n2(c1_no,c2_no,...) 
unit_acc_gen_m8n2(c1_no,c2_no,0,__VA_ARGS__) +#define KERNEL_h_k1m8n2 \ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" unit_acc_m8n2(8,9,%1) +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $16,%1;" +#define KERNEL_h_k1m8n4 KERNEL_h_k1m8n2 unit_acc_m8n2(10,11,%1,%%r12,1) +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define KERNEL_k1m8n6 KERNEL_h_k1m8n4 unit_acc_m8n2(12,13,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m8n8 KERNEL_k1m8n6 unit_acc_m8n2(14,15,%%r15) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%%r15;" +#define KERNEL_h_k1m8n10 KERNEL_h_k1m8n8 unit_acc_m8n2(16,17,%%r15,%%r12,1) +#define KERNEL_k1m8n10 KERNEL_h_k1m8n10 "addq $16,%%r15;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n10 unit_acc_m8n2(18,19,%%r15,%%r12,2) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;" + #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15) + #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) +#endif +#define save_init_m8 "movq %2,%3; addq $64,%2;" +#ifdef TRMMKERNEL + #define SAVE_m8n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); addq $64,%2;" + #define unit_save_m8n2(c1_no,c2_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3);"\ + "vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm2; vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m8n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); addq $64,%2;" + #define unit_save_m8n2(c1_no,c2_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3);"\ + "vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm2; vfmadd213pd (%3,%4,1),%%zmm0,%%zmm2; vmovupd %%zmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m8n2 save_init_m8 unit_save_m8n2(8,9) +#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(10,11) +#define SAVE_m8n6 SAVE_m8n4 unit_save_m8n2(12,13) +#define SAVE_m8n8 SAVE_m8n6 unit_save_m8n2(14,15) +#define SAVE_m8n10 SAVE_m8n8 unit_save_m8n2(16,17) +#define SAVE_m8n12 SAVE_m8n10 unit_save_m8n2(18,19) +#define INIT_m8n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;" +#define INIT_m8n2 unit_init_2zmm(8,9) +#define INIT_m8n4 INIT_m8n2 unit_init_2zmm(10,11) +#define INIT_m8n6 INIT_m8n4 unit_init_2zmm(12,13) +#define INIT_m8n8 INIT_m8n6 unit_init_2zmm(14,15) +#define INIT_m8n10 INIT_m8n8 unit_init_2zmm(16,17) +#define INIT_m8n12 INIT_m8n10 unit_init_2zmm(18,19) + +#define KERNEL_k1m4n1 \ + "vbroadcastsd (%1),%%ymm1; addq $8,%1;"\ + "vfmadd231pd (%0),%%ymm1,%%ymm4; addq $32,%0;" +#define unit_acc_gen_m4n2(c1_no,c2_no,boff,...)\ + "vbroadcastf128 "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm"#c1_no"; vfmadd231pd %%ymm2,%%ymm3,%%ymm"#c2_no";" +#define unit_acc_m4n2(c1_no,c2_no,...) 
unit_acc_gen_m4n2(c1_no,c2_no,0,__VA_ARGS__) +#define KERNEL_h_k1m4n2 \ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;" unit_acc_m4n2(4,5,%1) +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" +#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_acc_m4n2(6,7,%1,%%r12,1) +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define KERNEL_k1m4n6 KERNEL_h_k1m4n4 unit_acc_m4n2(8,9,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m4n8 KERNEL_k1m4n6 unit_acc_m4n2(10,11,%%r15) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%%r15;" +#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1) +#define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" + #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) +#endif +#define save_init_m4 "movq %2,%3; addq $32,%2;" +#ifdef TRMMKERNEL + #define SAVE_m4n1 "vmulpd %%ymm4,%%ymm0,%%ymm4; vmovupd %%ymm4,(%2); addq $32,%2;" + #define unit_save_m4n2(c1_no,c2_no)\ + "vunpcklpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vmulpd %%ymm1,%%ymm0,%%ymm1; vmovupd %%ymm1,(%3);"\ + "vunpckhpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm2; vmulpd %%ymm2,%%ymm0,%%ymm2; vmovupd %%ymm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m4n1 "vfmadd213pd (%2),%%ymm0,%%ymm4; vmovupd %%ymm4,(%2); addq $32,%2;" + #define unit_save_m4n2(c1_no,c2_no)\ + "vunpcklpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vfmadd213pd (%3),%%ymm0,%%ymm1; vmovupd %%ymm1,(%3);"\ + "vunpckhpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm2; vfmadd213pd (%3,%4,1),%%ymm0,%%ymm2; vmovupd %%ymm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m4n2 save_init_m4 unit_save_m4n2(4,5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(6,7) +#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(8,9) +#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(10,11) +#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(12,13) +#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(14,15) +#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define unit_init_2ymm(c1_no,c2_no) "vpxor %%ymm"#c1_no",%%ymm"#c1_no",%%ymm"#c1_no"; vpxor %%ymm"#c2_no",%%ymm"#c2_no",%%ymm"#c2_no";" +#define INIT_m4n2 unit_init_2ymm(4,5) +#define INIT_m4n4 INIT_m4n2 unit_init_2ymm(6,7) +#define INIT_m4n6 INIT_m4n4 unit_init_2ymm(8,9) +#define INIT_m4n8 INIT_m4n6 unit_init_2ymm(10,11) +#define INIT_m4n10 INIT_m4n8 unit_init_2ymm(12,13) +#define INIT_m4n12 INIT_m4n10 unit_init_2ymm(14,15) + +#define KERNEL_k1m2n1 \ + "vmovddup (%1),%%xmm1; addq $8,%1;"\ + "vfmadd231pd (%0),%%xmm1,%%xmm4; addq $16,%0;" +#define unit_acc_gen_m2n2(c1_no,c2_no,boff,...)\ + "vmovupd "#boff"("#__VA_ARGS__"),%%xmm3; vfmadd231pd %%xmm1,%%xmm3,%%xmm"#c1_no"; vfmadd231pd %%xmm2,%%xmm3,%%xmm"#c2_no";" +#define unit_acc_m2n2(c1_no,c2_no,...) 
unit_acc_gen_m2n2(c1_no,c2_no,0,__VA_ARGS__) +#define KERNEL_h_k1m2n2 \ + "vmovddup (%0),%%xmm1; vmovddup 8(%0),%%xmm2; addq $16,%0;" unit_acc_m2n2(4,5,%1) +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_acc_m2n2(6,7,%1,%%r12,1) +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" +#define KERNEL_k1m2n6 KERNEL_h_k1m2n4 unit_acc_m2n2(8,9,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m2n8 KERNEL_k1m2n6 unit_acc_m2n2(10,11,%%r15) +#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%%r15;" +#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1) +#define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;" +#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2) +#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" + #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) +#endif +#define save_init_m2 "movq %2,%3; addq $16,%2;" +#ifdef TRMMKERNEL + #define SAVE_m2n1 "vmulpd %%xmm4,%%xmm0,%%xmm4; vmovupd %%xmm4,(%2); addq $16,%2;" + #define unit_save_m2n2(c1_no,c2_no)\ + "vunpcklpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm1; vmulpd %%xmm1,%%xmm0,%%xmm1; vmovupd %%xmm1,(%3);"\ + "vunpckhpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm2; vmulpd %%xmm2,%%xmm0,%%xmm2; vmovupd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m2n1 "vfmadd213pd (%2),%%xmm0,%%xmm4; vmovupd %%xmm4,(%2); addq $16,%2;" + #define unit_save_m2n2(c1_no,c2_no)\ + "vunpcklpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm1; vfmadd213pd (%3),%%xmm0,%%xmm1; vmovupd %%xmm1,(%3);"\ + "vunpckhpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm2; vfmadd213pd (%3,%4,1),%%xmm0,%%xmm2; vmovupd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m2n2 save_init_m2 unit_save_m2n2(4,5) +#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(6,7) +#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(8,9) +#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(10,11) +#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(12,13) +#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(14,15) +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define unit_init_2xmm(c1_no,c2_no) "vpxor %%xmm"#c1_no",%%xmm"#c1_no",%%xmm"#c1_no"; vpxor %%xmm"#c2_no",%%xmm"#c2_no",%%xmm"#c2_no";" +#define INIT_m2n2 unit_init_2xmm(4,5) +#define INIT_m2n4 INIT_m2n2 unit_init_2xmm(6,7) +#define INIT_m2n6 INIT_m2n4 unit_init_2xmm(8,9) +#define INIT_m2n8 INIT_m2n6 unit_init_2xmm(10,11) +#define INIT_m2n10 INIT_m2n8 unit_init_2xmm(12,13) +#define INIT_m2n12 INIT_m2n10 unit_init_2xmm(14,15) + +#define KERNEL_k1m1n1 \ + "vmovsd (%1),%%xmm1; addq $8,%1;"\ + "vfmadd231sd (%0),%%xmm1,%%xmm4; addq $8,%0;" +#define KERNEL_h_k1m1n2 \ + "vmovddup (%0),%%xmm1; addq $8,%0;"\ + "vfmadd231pd (%1),%%xmm1,%%xmm4;" +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;" +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vfmadd231pd (%1,%%r12,1),%%xmm1,%%xmm5;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" +#define KERNEL_k1m1n6 KERNEL_h_k1m1n4 "vfmadd231pd (%1,%%r12,2),%%xmm1,%%xmm6; addq $16,%1;" +#define KERNEL_h_k1m1n8 KERNEL_k1m1n6 "vfmadd231pd (%%r15),%%xmm1,%%xmm7;" 
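+/* Note (editorial, not in the original patch): in this m=1 tail the usual
+   roles invert -- vmovddup broadcasts the single A element and the fmadds
+   run against packed pairs of B, so xmm4..xmm9 each hold one accumulated
+   column-pair of C for n up to 12. */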
+#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%%r15;" +#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;" +#define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;" +#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;" +#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" + #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" + #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" + #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" + #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" + #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" +#endif +#define save_init_m1 "movq %2,%3; addq $8,%2;" +#ifdef TRMMKERNEL + #define SAVE_m1n1 "vmulsd %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2); addq $8,%2;" + #define unit_save_m1n2(c1_no)\ + "vmulpd %%xmm"#c1_no",%%xmm0,%%xmm2; vmovsd %%xmm2,(%3); vmovhpd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define SAVE_m1n1 "vfmadd213sd (%2),%%xmm0,%%xmm4; vmovsd %%xmm4,(%2); addq $8,%2;" + #define unit_save_m1n2(c1_no)\ + "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vfmadd231pd %%xmm"#c1_no",%%xmm0,%%xmm2; vmovsd %%xmm2,(%3); vmovhpd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define SAVE_m1n2 save_init_m1 unit_save_m1n2(4) +#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(5) +#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(6) +#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(7) +#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(8) +#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(9) +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m1n2 INIT_m1n1 +#define INIT_m1n4 INIT_m1n2 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n6 INIT_m1n4 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define INIT_m1n8 INIT_m1n6 "vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m1n10 INIT_m1n8 "vpxor %%xmm8,%%xmm8,%%xmm8;" +#define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;" + +#define COMPUTE_SIMPLE(mdim,ndim)\ + init_update_k(mdim) INIT_m##mdim##n##ndim "testq %%r13,%%r13; jz 7"#mdim"7"#ndim"9f;"\ + "movq %%r13,%5;" INIT_set_papb(mdim,ndim)\ + kernel_kstart_n##ndim(mdim,subq)\ + "7"#mdim"7"#ndim"1:\n\t"\ + KERNEL_k1m##mdim##n##ndim "decq %5; jnz 7"#mdim"7"#ndim"1b;"\ + "7"#mdim"7"#ndim"9:\n\t"\ + kernel_kend_n##ndim(mdim)\ + SAVE_set_pa(mdim) SAVE_m##mdim##n##ndim save_update_k(mdim) +#define COMPUTE_m16n1 COMPUTE_SIMPLE(16,1) +#define COMPUTE_m16n2 COMPUTE_SIMPLE(16,2) +#define COMPUTE_m16n4 COMPUTE_SIMPLE(16,4) +#define COMPUTE_m16n6 COMPUTE_SIMPLE(16,6) +#define COMPUTE_m16n8 COMPUTE_SIMPLE(16,8) +#define COMPUTE_m16n10 COMPUTE_SIMPLE(16,10) +#if defined(TRMMKERNEL) && !defined(LEFT) && defined(TRANSA) + #define INVERSE_K_MID "negq %5; leaq 6(%%r13,%5,1),%5;" +#else + #define INVERSE_K_MID "negq %5; leaq 16(%%r13,%5,1),%5;" +#endif +#define COMPUTE_m16n12 \ + init_update_k(16) INIT_m16n12 "movq %%r13,%5;" INIT_set_papb(16,12) "movq %2,%3;"\ + kernel_kstart_n12(16,subq)\ + "cmpq $16,%5; jb 7167123f; movq $16,%5;"\ + "7167121:\n\t"\ + KERNEL_k1m16n12 "addq $4,%5; testq $12,%5; movq $172,%%r10; cmovz %4,%%r10;"\ + KERNEL_k1m16n12 "prefetcht1 (%3); subq $129,%3; addq %%r10,%3;"\ + KERNEL_k1m16n12 "prefetcht1 (%6); addq $32,%6; cmpq $208,%5; cmoveq %2,%3;"\ + KERNEL_k1m16n12 "cmpq %5,%%r13; jnb 7167121b;"\ + 
"movq %2,%3;" INVERSE_K_MID\ + "7167123:\n\t"\ + "testq %5,%5; jz 7167129f;"\ + "7167125:\n\t"\ + "prefetcht0 (%3); prefetcht0 64(%3); prefetcht0 127(%3);"\ + KERNEL_k1m16n12 "addq %4,%3; decq %5;jnz 7167125b;"\ + "7167129:\n\t"\ + kernel_kend_n12(16)\ + "prefetcht0 (%%r14);" SAVE_set_pa(16) SAVE_m16n12 save_update_k(16) +#define COMPUTE(ndim) {\ + b_pref = b_ptr + ndim * K; HEAD_SET_OFF(ndim)\ + __asm__ __volatile__(\ + "vbroadcastsd %8,%%zmm0; movq %7,%%r11; movq %1,%%r14; movq %10,%%r12; salq $4,%%r12;" INITASM_SET_K\ + "cmpq $16,%%r11; jb "#ndim"33102f;"\ + #ndim"33101:\n\t"\ + COMPUTE_m16n##ndim "subq $16,%%r11; cmpq $16,%%r11; jnb "#ndim"33101b;"\ + #ndim"33102:\n\t"\ + "cmpq $8,%%r11; jb "#ndim"33103f;"\ + COMPUTE_SIMPLE(8,ndim) "subq $8,%%r11;"\ + #ndim"33103:\n\t"\ + "cmpq $4,%%r11; jb "#ndim"33104f;"\ + COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\ + #ndim"33104:\n\t"\ + "cmpq $2,%%r11; jb "#ndim"33105f;"\ + COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ + #ndim"33105:\n\t"\ + "testq %%r11,%%r11; jz "#ndim"33106f;"\ + COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\ + #ndim"33106:\n\t"\ + "movq %%r14,%1;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K):"r10","r11","r12","r13","r14","r15","cc","memory",\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ + "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\ +} + +#include "common.h" +#include + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif +) +{ + if(m==0||n==0||k==0||alpha==0.0) return 0; + int64_t ldc_in_bytes = (int64_t)ldc * sizeof(double); double ALPHA = alpha; + int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; + BLASLONG n_count = n, off = 0; + double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B; +#ifdef TRMMKERNEL + #ifdef LEFT + off = offset; + #else + off = -offset; + #endif +#endif + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>9;n_count-=10) COMPUTE(10) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} + From f3f969f681ffc67f81ac78d37e51f83129232985 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 3 Feb 2020 21:34:12 +0800 Subject: [PATCH 861/935] Update param.h --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index 075c12ca2..219d99fc6 100644 --- a/param.h +++ b/param.h @@ -1660,14 +1660,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -1701,12 +1701,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_P 640 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 192 #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 320 -#define DGEMM_DEFAULT_Q 168 +#define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 From 081b1885294afedf7f5ee87e1d9bd8b82d096664 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 3 Feb 2020 21:38:08 +0800 Subject: [PATCH 862/935] Update KERNEL.SKYLAKEX --- kernel/x86_64/KERNEL.SKYLAKEX | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 0e6275748..dcd201649 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,10 +7,13 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMKERNEL = dgemm_kernel_4x8_skylakex_2.c - -DGEMMONCOPY = dgemm_ncopy_8_skylakex.c -DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c +DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c +DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c From 83b6be7976dd02973851c6df68e579a024aebfcc Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 4 Feb 2020 19:55:26 +0800 Subject: [PATCH 863/935] Update param.h --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 219d99fc6..e6ab93aa5 100644 --- a/param.h +++ b/param.h @@ -1711,7 +1711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R 13824 +#define DGEMM_DEFAULT_R 8640 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r From 1c3e20ce483c5b6a9bb457bd199c16a0b0bbd8de Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 4 Feb 2020 20:30:23 +0800 Subject: [PATCH 864/935] Update level3.c --- driver/level3/level3.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 1ab7a740e..9aa67286f 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); From 77b8f49556096952fb4495b7019b58597f81dce8 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 4 Feb 2020 20:33:08 +0800 Subject: [PATCH 865/935] Update level3_thread.c --- driver/level3/level3_thread.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index cfbff7554..bf558447e 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -365,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif /* Copy part of local region of B into workspace */ START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, From 833bd0f8ffd1b7921ca8e98196e40183158f8cb7 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 5 Feb 2020 10:09:41 +0800 Subject: [PATCH 866/935] Update trmm_L.c --- driver/level3/trmm_L.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 8a81d31a0..9117090b5 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); @@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG 
*range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); @@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, @@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, From 2f96a2c55b2cfae827973b3002ae5af8bfa6d7d6 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 5 Feb 2020 10:15:02 +0800 Subject: [PATCH 867/935] Update trmm_R.c --- driver/level3/trmm_R.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 0882aa496..62c6a2442 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #else @@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); #else @@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX 
+ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else @@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); #else @@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); @@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; +#ifdef SKYLAKEX + /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ + if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - +#endif #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else From 096da2f51acc37d67e84646336753a77b4b007ad Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 5 Feb 2020 13:36:57 +0800 Subject: [PATCH 868/935] Update dgemm_kernel_16x2_skylakex.c --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 250ff8d49..743ad5aa7 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -1,3 +1,6 @@ +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = size_of_1_tile_in_b, r13 = k, r14 = b_head, r15 = %1+3*r12 */ + #if (defined (LEFT) && !defined(TRANSA)) || (!defined (LEFT) && defined(TRANSA)) #define BACKWARDS 1 #else From 0b909203cb31f4667bfc9172d475d20d53bbad75 Mon Sep 17 00:00:00 2001 From: w00421467 Date: Wed, 5 Feb 2020 14:53:37 +0800 Subject: [PATCH 869/935] Fix bugs in benchmark of gemv --- benchmark/gemv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index b6a42f42f..781df695e 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -197,7 +197,7 @@ int main(int argc, 
char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -208,7 +208,7 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ + for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); From ce9ea8f826e5a9ded2719b4104ca4901bab5b2fe Mon Sep 17 00:00:00 2001 From: w00421467 Date: Wed, 5 Feb 2020 15:07:18 +0800 Subject: [PATCH 870/935] Fix another branch --- benchmark/gemv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index 781df695e..adf8f3d91 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -234,7 +234,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -245,7 +245,7 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ + for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); From 4e00d96a78b8b2d7b6cda9dafc9a72b6777d8828 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 6 Feb 2020 01:46:36 +0000 Subject: [PATCH 871/935] Update dgemm_kernel_16x2_skylakex.c --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 743ad5aa7..4c4c2f4e4 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -467,7 +467,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, #endif ) { - if(m==0||n==0||k==0||alpha==0.0) return 0; + if(m==0||n==0) return 0; int64_t ldc_in_bytes = (int64_t)ldc * sizeof(double); double ALPHA = alpha; int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; BLASLONG n_count = n, off = 0; From 8b5cdcc64c0b5ed4fc58e9bd6a314759eb6c1294 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 6 Feb 2020 01:47:46 +0000 Subject: [PATCH 872/935] Update sgemm_kernel_8x4_haswell.c --- kernel/x86_64/sgemm_kernel_8x4_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c index 2b8aa9862..2250e4b97 100644 --- a/kernel/x86_64/sgemm_kernel_8x4_haswell.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c @@ -467,7 +467,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f ,BLASLONG offset #endif ){ - if(m==0||n==0||k==0||alpha==0.0) return 0; + if(m==0||n==0) return 0; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float); float constval = alpha; float *const_val=&constval; From 3447d04eaf3c98b4fcd1ba41fa13774ff15c72e3 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 6 Feb 2020 02:14:10 +0000 Subject: [PATCH 873/935] Update dgemm_kernel_16x2_skylakex.c --- 
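[Note, below the fold: the one-line move in this patch is subtle.
COMPUTE_SIMPLE used to test the total k count (%%r13) before entering the
loop, but in the TRMM case kernel_kstart_n<ndim> first consumes iterations
from the per-call counter (%5), so it is the remaining count that can
legitimately reach zero -- and the old do/while would then wrap the counter
around. A rough C rendering of the corrected control flow follows;
prologue_iters and micro_kernel_step are purely illustrative names.]

    long remaining = k;             /* movq %%r13,%5          */
    remaining -= prologue_iters;    /* kernel_kstart_n<ndim>  */
    if (remaining != 0)             /* testq %5,%5; jz ...9f  */
        do {
            micro_kernel_step();    /* KERNEL_k1m<m>n<n>      */
        } while (--remaining != 0); /* decq %5; jnz ...1b     */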
kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 4c4c2f4e4..416ace59b 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -393,9 +393,10 @@ #define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;" #define COMPUTE_SIMPLE(mdim,ndim)\ - init_update_k(mdim) INIT_m##mdim##n##ndim "testq %%r13,%%r13; jz 7"#mdim"7"#ndim"9f;"\ + init_update_k(mdim) INIT_m##mdim##n##ndim\ "movq %%r13,%5;" INIT_set_papb(mdim,ndim)\ kernel_kstart_n##ndim(mdim,subq)\ + "testq %5,%5; jz 7"#mdim"7"#ndim"9f;"\ "7"#mdim"7"#ndim"1:\n\t"\ KERNEL_k1m##mdim##n##ndim "decq %5; jnz 7"#mdim"7"#ndim"1b;"\ "7"#mdim"7"#ndim"9:\n\t"\ From 9694037b2317b944e925a98a4d3e222e9d70311e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Feb 2020 10:09:25 +0100 Subject: [PATCH 874/935] Set SUFFIX in tempfile commands, fix bad architecture option for PGI compiler in avx512 test --- c_check | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/c_check b/c_check index 3d82aa73c..543789207 100644 --- a/c_check +++ b/c_check @@ -188,13 +188,13 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { if ($@){ warn "could not load PERL module File::Temp, so could not check MSA capatibility"; } else { - $tmpf = new File::Temp( UNLINK => 1 ); + $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $code = '"addvi.b $w0, $w1, 1"'; $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; print $tmpf "#include \n\n"; print $tmpf "void main(void){ __asm__ volatile($code); }\n"; - $args = "$msa_flags -o $tmpf.o -x c $tmpf"; + $args = "$msa_flags -o $tmpf.o $tmpf"; my @cmd = ("$compiler_name $args"); system(@cmd) == 0; if ($? != 0) { @@ -229,10 +229,13 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { $no_avx512 = 0; } else { # $tmpf = new File::Temp( UNLINK => 1 ); - ($fh,$tmpf) = tempfile( UNLINK => 1 ); + ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; - $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; + $args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf"; + if ($compiler eq "PGI") { + $args = " -tp skylake -c -o $tmpf.o $tmpf"; + } my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? != 0) { From 68a43db35882aebe8eefff1756eb86d2ffb31b1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Feb 2020 10:15:18 +0100 Subject: [PATCH 875/935] Fix utest compilation with PGI --- utest/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index 8c7e6b9f8..bd4bdf3ae 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -31,6 +31,10 @@ OBJS += test_fork.o endif endif +ifeq ($(C_COMPILER), PGI) +OBJS = utest_main2.o +endif + all : run_test $(UTESTBIN): $(OBJS) From 598984152426e611e22ebe064745b7af602914af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Feb 2020 13:01:31 +0100 Subject: [PATCH 876/935] Add PGI to avx512-supporting compilers --- getarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index 1f590390a..3a78771c2 100644 --- a/getarch.c +++ b/getarch.c @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #endif -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6) || defined(__PGI)) #else #define NO_AVX512 #endif From d55b10830f9a077fea3a1e785bf214370ef0f959 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Feb 2020 16:02:17 +0100 Subject: [PATCH 877/935] Remove OpenMP libraries from link list --- c_check | 1 + 1 file changed, 1 insertion(+) diff --git a/c_check b/c_check index 543789207..fbd1838aa 100644 --- a/c_check +++ b/c_check @@ -321,6 +321,7 @@ $linker_a = ""; && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) + && ($flags !~ /omp/) ) { $linker_l .= $flags . " " } From cfe63d8cc202ca7dc9cf58b27f3a898b6087cfac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Feb 2020 16:03:51 +0100 Subject: [PATCH 878/935] Remove OpenMP libraries from link list --- f_check | 1 + 1 file changed, 1 insertion(+) diff --git a/f_check b/f_check index 79b24e2dc..fac8fc707 100644 --- a/f_check +++ b/f_check @@ -334,6 +334,7 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) + && ($flags !~ /omp/) && ($flags !~ /^\-l$/) ) { $linker_l .= $flags . " "; From b3cbd60d7aa39ea76d5a2590ee5eb54038fea3c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Feb 2020 10:20:13 +0100 Subject: [PATCH 879/935] Remove PGI from list again as it is actually still not capable --- getarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index 3a78771c2..1f590390a 100644 --- a/getarch.c +++ b/getarch.c @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6) || defined(__PGI)) +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) #else #define NO_AVX512 #endif From 50545b19d083558117337a393eff9ee656b22fd7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 00:06:07 +0100 Subject: [PATCH 880/935] Update CPU and OS support and document DYNAMIC_ARCH option in README.md prompted by #2388 --- README.md | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 14815ff00..b1ecc6521 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code using Git from https://github.com/xianyi/OpenBLAS.git. +Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. +Most can also be given directly on the make or cmake command line. ### Dependencies @@ -101,7 +103,7 @@ The default installation directory is `/opt/OpenBLAS`. ## Supported CPUs and Operating Systems -Please read `GotoBLAS_01Readme.txt`. +Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS. ### Additional supported CPUs @@ -109,8 +111,8 @@ Please read `GotoBLAS_01Readme.txt`. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. -- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. 
-- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. +- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. +- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. @@ -129,8 +131,15 @@ Please read `GotoBLAS_01Readme.txt`. #### ARM64 -- **ARMv8**: Experimental -- **ARM Cortex-A57**: Experimental +- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS +- **Cortex-A53**: same as ARMV8 (different cpu specifications) +- **Cortex A57**: Optimized Level-3 and Level-2 functions +- **Cortex A72**: same as A57 ( different cpu specifications) +- **Cortex A73**: same as A57 (different cpu specifications) +- **Falkor**: same as A57 (different cpu specifications) +- **ThunderX**: Optimized some Level-1 functions +- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2 +- **TSV110**: Optimized some Level-3 helper functions #### PPC/PPC64 @@ -139,18 +148,34 @@ Please read `GotoBLAS_01Readme.txt`. #### IBM zEnterprise System -- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) -- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) +- **Z13**: Optimized Level-3 BLAS and Level-1,2 +- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2 + +### Support for multiple targets in a single library + +OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake. +For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default. +DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, +Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. +On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. +For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. +The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the +common code in the library, usually you will want to set this to the oldest model you expect to encounter. +Please not that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. ### Supported OS - **GNU/Linux** - **MinGW or Visual Studio (CMake)/Windows**: Please read . -- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts. +- **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts. - **FreeBSD**: Supported by the community. We don't actively test the library on this OS. 
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS. +- **NetBSD**: Supported by the community. We don't actively test the library on this OS. - **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. - **Android**: Supported by the community. Please read . +- **AIX**: Supported on PPC up to POWER8 +- **Haiku**: Supported by the community. We don't actively test the library on this OS. +- **SunOS**: Supported by the community. We don't actively test the library on this OS: ## Usage From 47c1bf7f4de7d6b91d6b8f419eb0854e4e94e413 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 01:06:40 +0100 Subject: [PATCH 881/935] typo fixes --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b1ecc6521..04f43f4c7 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. -Please not that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. +Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. ### Supported OS @@ -230,7 +230,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. * Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. -* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), +* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`), there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build the library with `BIGNUMA=1`. * OpenBLAS does not set processor affinity by default. 
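The DYNAMIC_ARCH support documented in the README patches above is implemented as a table of per-core function pointers (a gotoblas_t structure) that is selected once at load time by a CPU-detection routine — the pattern visible in the driver/others/dynamic_*.c changes later in this series. A minimal self-contained sketch of that pattern follows; the probe and the kernel bodies are hypothetical placeholders, not actual OpenBLAS code:

#include <stdio.h>

/* One entry per supported core; the real gotoblas_t carries every
   BLAS kernel pointer, not just one. */
typedef struct {
  const char *corename;
  void (*sgemm_kernel)(void);
} gotoblas_t;

static void sgemm_generic(void) { puts("generic sgemm"); }
static void sgemm_haswell(void) { puts("haswell sgemm"); }

static gotoblas_t gotoblas_GENERIC = { "generic", sgemm_generic };
static gotoblas_t gotoblas_HASWELL = { "haswell", sgemm_haswell };

/* All entry points dispatch through this pointer. */
static gotoblas_t *gotoblas = NULL;

/* Placeholder standing in for the real CPUID / /proc/cpuinfo probing. */
static int cpu_has_avx2(void) { return 1; }

static void gotoblas_dynamic_init(void) {
  gotoblas = cpu_has_avx2() ? &gotoblas_HASWELL : &gotoblas_GENERIC;
}

int main(void) {
  gotoblas_dynamic_init();
  printf("selected core: %s\n", gotoblas->corename);
  gotoblas->sgemm_kernel(); /* every BLAS call goes through the table */
  return 0;
}

The gotoblas_corename() and force_coretype() functions edited in the dynamic_zarch.c patches further below are the query and user-facing override for the same table.
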
From 32d97330b3071fff273cb2a3fa136666a21b7fb4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:00:36 +0100 Subject: [PATCH 882/935] Update with changes from 0.3.8 --- Changelog.txt | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index f160a4e13..549ca4aa5 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,59 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.8 + 9-Feb-2020 + +common: +` * LAPACK has been updated to 3.9.0 (plus patches up to + January 2nd, 2020) + * CMAKE support has been improved in several areas including + cross-compilation + * a thread race condition in the GEMM3M kernels was resolved + * the "generic" (plain C) gemm beta kernel used by many targets + has been sped up + * an optimized version of the LAPACK trtrs functions has been added + * an incompatibilty between the LAPACK tests and the OpenBLAS + implementation of XERBLA was resolved, removing the numerous + warnings about wrong error exits in the former + * support for NetBSD has been added + * support for compilation with g95 and non-GNU versions of ld + has been improved + * support for compilation with (upcoming) gcc 10 has been added + +POWER: + * worked around miscompilation of several POWER8 and POWER9 + kernels by older versions of gcc + * added support for big-endian POWER8 and for compilation on AIX + * corrected bugs in the big-endian support for PPC440 and PPC970 + * DYNAMIC_ARCH support is now available in CMAKE builds as well + +ARMV8: + * performance of DGEMM_BETA and SGEMM_NCOPY has been improved + * compilation for 32bit works again + * performance of the RPCC function has been improved + * improved performance on small systems + * DYNAMIC_ARCH support is now available in CMAKE builds as well + * cross-compilation from OSX to IOS was simplified + +x86_64: + * a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel + was significantly improved + * optimized AVX512 kernels for CGEMM and ZGEMM have been added + * AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly + sped up and optimized CGEMM3M and ZGEMM3M kernels have been added + * added support for QEMU virtual cpus + * a compilation problem with PGI and SUN compilers was fixed + * Intel "Goldmont plus" is now autodetected + * a potential crash on program exit on MS Windows has been fixed + +x86: + * an unwanted case sensitivity in the implementation of LSAME + on older 32bit AMD cpus was fixed + +zarch: + * Z15 is now supported as Z14 + * DYNAMIC_ARCH is now available on ZARCH as well + ==================================================================== Version 0.3.7 11-Aug 2019 From f03dd23e90a4e248903b4c2f66c3507da716c38d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:18:07 +0100 Subject: [PATCH 883/935] Increment version to 0.3.9.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e5a1b551..951271717 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 8) +set(OpenBLAS_PATCH_VERSION 9.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 3bec250cf995c8f96e0849e582227589234effcd Mon Sep 17 00:00:00 
2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:18:44 +0100 Subject: [PATCH 884/935] Increment version to 0.3.9.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 618034bb8..21b7e138a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.8 +VERSION = 0.3.9.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 579be3aa9d0e196fc9cc91f6e1f2372e87638f78 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:28:04 +0100 Subject: [PATCH 885/935] Add configuration option for BUFFER_SIZE --- Makefile.rule | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile.rule b/Makefile.rule index 21b7e138a..724a60ec4 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -97,6 +97,15 @@ VERSION = 0.3.9.dev # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 +# When multithreading, OpenBLAS needs to use a memory buffer for communicating +# and collating results for individual subranges of the original matrix. Since +# the original GotoBLAS of the early 2000s, the default size of this buffer has +# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC. +# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment +# this line and adjust the (32< Date: Sun, 9 Feb 2020 23:30:22 +0100 Subject: [PATCH 886/935] Make BUFFER_SIZE configurable --- common_x86_64.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index c05998d58..fe5539abe 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -225,7 +225,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #define HUGE_PAGESIZE ( 2 << 20) +#ifndef BUFFERSIZE #define BUFFER_SIZE (32 << 20) +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif #define SEEK_ADDRESS From 7f0d523b42feb70e7b8ad299d8005d73f620f219 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2020 23:32:57 +0100 Subject: [PATCH 887/935] Make BUFFER_SIZE configurable --- cmake/system.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4f8011603..ce980a7b9 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -289,6 +289,10 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") +if (BUFFERSIZE) +set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}") +endif () + if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () From 754433f4208832d14119b398e8dd46fbc448a828 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 10 Feb 2020 19:11:45 +0800 Subject: [PATCH 888/935] =?UTF-8?q?Avoid=20printing=20the=20following=20in?= =?UTF-8?q?formation=20on=20mips=20and=20mips64=20when=20check=20msa:=20"u?= =?UTF-8?q?nrecognized=20command=20line=20option=20=E2=80=98-mmsa=E2=80=99?= =?UTF-8?q?"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index fbd1838aa..555b2eccf 100644 --- a/c_check +++ b/c_check @@ -195,7 +195,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { print $tmpf "void main(void){ __asm__ volatile($code); }\n"; $args = "$msa_flags 
-o $tmpf.o $tmpf"; - my @cmd = ("$compiler_name $args"); + my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? != 0) { $have_msa = 0; From 303bdb673b8ef7b9e2ccdbb331827e00a1293951 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Feb 2020 19:17:32 +0100 Subject: [PATCH 889/935] Fix coretype detection for Intel extended models 6 and 7 affecting Goldmont, Cannon Lake, Ice Lake autodetection --- cpuid_x86.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 9e1c8e752..e29adecae 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2006,6 +2006,38 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 6: + if (model == 6) +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif + break; + case 7: + if (model == 10) + return CORE_NEHALEM; + if (model == 14) +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif + break; case 9: case 8: if (model == 14) { // Kaby Lake From 7e5cbb6f3554342151c522ad4ab20111bf48f5d3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Feb 2020 21:17:39 +0100 Subject: [PATCH 890/935] Fix bad conditional syntax that caused spurious application of USE_TRMM --- kernel/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index ad15b8f25..b3310e87e 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -121,8 +121,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - - if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") + if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) + set(USE_TRMM true) + endif () + if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) set(USE_TRMM true) endif () From dff173e50e01d94e0741e4b4eaa1cf0aa01cf320 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 11 Feb 2020 14:46:30 +1300 Subject: [PATCH 891/935] Fix typo in dynamic_zarch.c --- driver/others/dynamic_zarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 1206bf870..896e65bb4 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -31,7 +31,7 @@ char* gotoblas_corename(void) { } // __builtin_cpu_is is not supported by zarch -static gotolabs_t* get_coretype(void) { +static gotoblas_t* get_coretype(void) { FILE* infile; char buffer[512], * p; From 5a6bba3061f19923eb9972378021e6498bf8e5ed Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 11 Feb 2020 15:07:33 +1300 Subject: [PATCH 892/935] Patch out instances of Z15 in dynamic_zarch.c There does not appear to be a Z15 kernel yet, causing link errors from the code. This patch fixes the issue. 
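
The failure mode behind this patch is the corename[] table, the NUM_CORETYPES count and the force_coretype() switch falling out of step when an entry such as Z15 is commented out. A hypothetical guard — not part of this patch — that would catch the table and the count drifting apart at compile time wherever C11 is available:

/* Post-patch state of the zarch table: 4 live entries. */
#define NUM_CORETYPES 4

static char *corename[] = {
    "unknown",
    "Z13",
    "Z14",
    "ZARCH_GENERIC",
};

/* C11 static assertion: fails the build, rather than the link or the
   run, if a name is added or removed without updating the count. */
_Static_assert(sizeof(corename) / sizeof(corename[0]) == NUM_CORETYPES,
               "NUM_CORETYPES must match the corename[] table");
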
--- driver/others/dynamic_zarch.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 1206bf870..c7b82e4df 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -3,12 +3,12 @@ extern gotoblas_t gotoblas_Z13; extern gotoblas_t gotoblas_Z14; -extern gotoblas_t gotoblas_Z15; +//extern gotoblas_t gotoblas_Z15; //#if (!defined C_GCC) || (GCC_VERSION >= 60000) //extern gotoblas_t gotoblas_Z14; //#endif -#define NUM_CORETYPES 5 +#define NUM_CORETYPES 4 extern void openblas_warning(int verbose, const char* msg); @@ -16,14 +16,14 @@ static char* corename[] = { "unknown", "Z13", "Z14", - "Z15", +// "Z15", "ZARCH_GENERIC", }; char* gotoblas_corename(void) { if (gotoblas == &gotoblas_Z13) return corename[1]; if (gotoblas == &gotoblas_Z14) return corename[2]; - if (gotoblas == &gotoblas_Z15) return corename[3]; +// if (gotoblas == &gotoblas_Z15) return corename[3]; //#if (!defined C_GCC) || (GCC_VERSION >= 60000) // if (gotoblas == &gotoblas_POWER9) return corename[3]; //#endif @@ -78,7 +78,7 @@ static gotoblas_t* force_coretype(char* coretype) { { case 1: return (&gotoblas_Z13); case 2: return (&gotoblas_Z14); - case 3: return (&gotoblas_Z15); +// case 3: return (&gotoblas_Z15); //#if (!defined C_GCC) || (GCC_VERSION >= 60000) // case 3: return (&gotoblas_POWER9); //#endif From 7ea5e07d1cb59834428d982818b7cf565dcda4df Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Wed, 12 Feb 2020 14:11:44 +0000 Subject: [PATCH 893/935] Fix inline asm in dscal: mark x, x1 as clobbered. Fixes #2408 The leaq instructions in dscal_kernel_inc_8 modify x and x1 so they must be declared as input/output constraints, otherwise the compiler may assume the corresponding registers are not modified. 
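
The general rule the fix applies: any register operand that the asm template itself modifies, including a pointer advanced with leaq, must be declared as a read-write output ("+r"), because a plain input constraint ("r") licenses GCC to assume the register is unchanged and to reuse its value afterwards. A standalone x86-64 sketch of the correct form — illustrative only, not the dscal kernel itself:

#include <stdio.h>

/* Sums four doubles, advancing the pointer inside the asm body. */
static double sum4(const double *x) {
  double s0, s1;
  const double *p = x;
  __asm__(
      "movsd   (%[p]), %[s0]  \n\t"
      "addsd  8(%[p]), %[s0]  \n\t"
      "leaq  16(%[p]), %[p]   \n\t"  /* p is modified here ...      */
      "movsd   (%[p]), %[s1]  \n\t"
      "addsd  8(%[p]), %[s1]  \n\t"
      : [s0] "=&x" (s0), [s1] "=&x" (s1),
        [p] "+r" (p)                 /* ... so "+r", never just "r" */
      :
      : "memory");                   /* we read memory not named as
                                        an operand */
  return s0 + s1;
}

int main(void) {
  double v[4] = {1.0, 2.0, 3.0, 4.0};
  printf("%g\n", sum4(v)); /* prints 10 */
  return 0;
}
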
--- kernel/x86_64/dscal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d0d7801fd..e2436f789 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -136,10 +136,10 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ "jnz 1b \n\t" : - "+r" (n) // 0 + "+r" (n), // 0 + "+r" (x), // 1 + "+r" (x1) // 2 : - "r" (x), // 1 - "r" (x1), // 2 "r" (alpha), // 3 "r" (inc_x), // 4 "r" (inc_x3) // 5 From dc345d84df90a54e6f416ab7f3604b645297df23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 19:56:52 +0100 Subject: [PATCH 894/935] Fix syntax of endianness conditional and add gcc version check for workaround --- kernel/power/KERNEL.POWER8 | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index fb9452a35..ba9a99cd1 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,30 +89,52 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) ISAMAXKERNEL = isamax_power8.S else ISAMAXKERNEL = isamax.c endif +else +ISAMAXKERNEL = isamax.c +endif +# IDAMAXKERNEL = idamax.c -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +# +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) ICAMAXKERNEL = icamax_power8.S else ICAMAXKERNEL = icamax.c endif +else +ICAMAXKERNEL = icamax.c +endif +# IZAMAXKERNEL = izamax.c # -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) ISAMINKERNEL = isamin_power8.S else ISAMINKERNEL = isamin.c endif +else +ISAMINKERNEL = isamin.c +endif +# IDAMINKERNEL = idamin.c -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +# +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) ICAMINKERNEL = icamin_power8.S else ICAMINKERNEL = icamin.c endif +else +ICAMINKERNEL = icamin.c +endif +# IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -128,11 +150,16 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power8.S else CAXPYKERNEL = caxpy.c endif +else +CAXPYKERNEL = caxpy.c +endif +# ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c From 120d20731face6457aa9f234eeaecf45f51a0b51 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 19:58:42 +0100 Subject: [PATCH 895/935] Fix syntax of endianness conditional --- kernel/power/KERNEL.PPC440 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index a0696b548..8d64d3fc1 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -15,7 +15,7 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c else @@ -25,7 +25,7 @@ endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") CDOTKERNEL = zdot_ppc440.S ZDOTKERNEL = 
zdot_ppc440.S else @@ -62,7 +62,7 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S -ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S else @@ -132,7 +132,7 @@ ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S -ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c From 0544cbc806466a42a01b8235c87938ed93c221d6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 20:00:29 +0100 Subject: [PATCH 896/935] Fix syntax of endianness conditional --- kernel/power/KERNEL.PPC970 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index de30977de..cc1f215de 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,4 +1,4 @@ -ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") SGEMMKERNEL = gemm_kernel.S SGEMMINCOPY = SGEMMITCOPY = @@ -30,7 +30,7 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") CGEMMKERNEL = zgemm_kernel.S CGEMMINCOPY = CGEMMITCOPY = @@ -72,7 +72,7 @@ ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S -ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") STRSMKERNEL_LN = trsm_kernel_LN.S STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S From 7c162b8a2104d316e05883f77c98bc795ccc7f81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 23:56:57 +0100 Subject: [PATCH 897/935] Update isamax_power8.S --- kernel/power/isamax_power8.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S index fa5433333..7938779c9 100644 --- a/kernel/power/isamax_power8.S +++ b/kernel/power/isamax_power8.S @@ -11,7 +11,8 @@ #include "common.h" PROLOGUE - + +isamax_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From de40d47edf316a26f5607a242f8bda69c9f06caf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 23:57:48 +0100 Subject: [PATCH 898/935] Update isamin_power8.S --- kernel/power/isamin_power8.S | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S index c9b6acb85..432c0e776 100644 --- a/kernel/power/isamin_power8.S +++ b/kernel/power/isamin_power8.S @@ -11,6 +11,7 @@ PROLOGUE +isamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From 8eefa530cd141be1b38fe12ebc3831ba9c285c15 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2020 23:59:50 +0100 Subject: [PATCH 899/935] Update isamax_power8.S --- kernel/power/isamax_power8.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S index 7938779c9..ec1c283f3 100644 --- a/kernel/power/isamax_power8.S +++ b/kernel/power/isamax_power8.S @@ -12,7 +12,9 @@ PROLOGUE +#if _CALL_ELF == 2 isamax_k: +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From 5ba3699f419185898d368a4b7618d396dff419ba Mon Sep 17 
00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Feb 2020 00:00:32 +0100 Subject: [PATCH 900/935] Update isamin_power8.S --- kernel/power/isamin_power8.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S index 432c0e776..1978af880 100644 --- a/kernel/power/isamin_power8.S +++ b/kernel/power/isamin_power8.S @@ -11,7 +11,9 @@ PROLOGUE +#if _CALL_ELF ==2 isamin_k: +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From 0e05ea9baca5d2c95a063a579367522d1ada811f Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 13 Feb 2020 14:51:55 +0100 Subject: [PATCH 901/935] Add CMake related files to .gitignore. --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e9d08ca7e..6803a919e 100644 --- a/.gitignore +++ b/.gitignore @@ -87,4 +87,5 @@ build.* *.swp benchmark/*.goto benchmark/smallscaling - +CMakeCache.txt +CMakeFiles/* From 486c35c5dc1f3fa3bae87e0256342068070c9ed6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Feb 2020 18:38:43 +0100 Subject: [PATCH 902/935] Update icamin_power8.S --- kernel/power/icamin_power8.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/power/icamin_power8.S b/kernel/power/icamin_power8.S index f2993e83e..e4469eb64 100644 --- a/kernel/power/icamin_power8.S +++ b/kernel/power/icamin_power8.S @@ -10,7 +10,9 @@ #include "common.h" PROLOGUE - +#if _CALL_ELF ==2 +icamin_k: +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From 92ca92a46c98f09121844199fa853ed0428ab521 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Feb 2020 21:24:54 +0100 Subject: [PATCH 903/935] Update caxpy_power8.S --- kernel/power/caxpy_power8.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S index 294a1d24d..dbbd55196 100644 --- a/kernel/power/caxpy_power8.S +++ b/kernel/power/caxpy_power8.S @@ -12,6 +12,13 @@ PROLOGUE +#if _CALL_ELF ==2 +#ifdef CONJ +caxpyc_k: +#else +caxpyc: +#endif +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l From cafdd999b81a08b1a003d53e52e53ee04b1d1842 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Feb 2020 22:44:09 +0100 Subject: [PATCH 904/935] Update caxpy_power8.S --- kernel/power/caxpy_power8.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S index dbbd55196..10a372832 100644 --- a/kernel/power/caxpy_power8.S +++ b/kernel/power/caxpy_power8.S @@ -16,7 +16,7 @@ #ifdef CONJ caxpyc_k: #else -caxpyc: +caxpy_k: #endif #endif .LCF0: From eb285b4d20a46ce8db7f6234c9d4d804d81be41e Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Fri, 14 Feb 2020 10:45:31 +0100 Subject: [PATCH 905/935] Make ctest verbose for drone builder. --- .drone.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.drone.yml b/.drone.yml index 779912954..696c5a99d 100644 --- a/.drone.yml +++ b/.drone.yml @@ -92,7 +92,7 @@ steps: - mkdir build && cd build - cmake $CMAKE_FLAGS .. - make -j - - ctest + - ctest -V --- kind: pipeline @@ -116,7 +116,7 @@ steps: - mkdir build && cd build - cmake $CMAKE_FLAGS .. - make -j - - ctest + - ctest -V --- kind: pipeline @@ -140,4 +140,4 @@ steps: - mkdir build && cd build - cmake $CMAKE_FLAGS .. 
- make -j - - ctest + - ctest -V From c222b25b81672b0af5b4c550915d097be3d6dd88 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Feb 2020 19:29:14 +0100 Subject: [PATCH 906/935] Correct generation of GETRF files by the CMAKE build fixes #2396 --- lapack/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index d48a270ab..e21a9aabb 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -4,7 +4,6 @@ include_directories(${PROJECT_BINARY_DIR}) set(LAPACK_SOURCES - getrf/getrf_single.c potrf/potrf_U_single.c potrf/potrf_L_single.c lauum/lauum_U_single.c @@ -45,6 +44,10 @@ GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) +foreach (float_type ${FLOAT_TYPES}) +GenerateNamedObjects("getrf/getrf_single.c" "UNIT" "getrf_single" false "" "" false ${float_type}) +endforeach () + # dynamic_arch laswp needs arch specific code ? #foreach(TARGET_CORE ${DYNAMIC_CORE}) # set(TSUFFIX "_${TARGET_CORE}") @@ -81,7 +84,7 @@ if (USE_THREAD) ) foreach (float_type ${FLOAT_TYPES}) - GenerateNamedObjects("${GETRF_SRC}" "" "getrf_parallel" false "" "" false ${float_type}) + GenerateNamedObjects("${GETRF_SRC}" "UNIT" "getrf_parallel" false "" "" false ${float_type}) endforeach() GenerateNamedObjects("${PARALLEL_SOURCES}") From 46e4b12946ddbed0eb6bb42092fc0f8b21561a8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Feb 2020 23:06:51 +0100 Subject: [PATCH 907/935] Update KERNEL.POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index fb9452a35..ab03e63ea 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -86,7 +86,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMAXKERNEL = ../arm/max.c #DMAXKERNEL = ../arm/max.c # -#SMINKERNEL = ../arm/min.c +SMINKERNEL = min_ppc440.S #DMINKERNEL = ../arm/min.c # ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) From d92bd5be246f9bbddb7a5406526f992e5c4ac72e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Feb 2020 23:07:50 +0100 Subject: [PATCH 908/935] Update KERNEL.POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index ab03e63ea..da765a26e 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -86,7 +86,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMAXKERNEL = ../arm/max.c #DMAXKERNEL = ../arm/max.c # -SMINKERNEL = min_ppc440.S +#SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) From d483e9270ab21e2171e33803ac7de60f2c5e0fae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Feb 2020 17:29:35 +0100 Subject: [PATCH 909/935] Update KERNEL.POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 14de0c08d..ba9a99cd1 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -86,7 +86,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMAXKERNEL = ../arm/max.c #DMAXKERNEL = ../arm/max.c # -#SMINKERNEL = ../arm/min.c +#SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # ifneq 
($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") From e32f3b144766362dad5f9def1005076c9ea82b85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Feb 2020 17:32:13 +0100 Subject: [PATCH 910/935] Restore -march flag for Android builds fixes #2419 - renewed discussion in #2112 suggests removal of the option was primarily aimed at non-Android builds --- Makefile.arm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index b5d80f8e6..fac6b56824 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -FCOMMON_OPT += -mfpu=neon +CCOMMON_OPT += -mfpu=neon -march=armv7-a +FCOMMON_OPT += -mfpu=neon -march=armv7-a else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a From 4326dcb460c78a062743fd5d7c7f0ac520ff7b56 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 16 Feb 2020 15:11:40 -0600 Subject: [PATCH 911/935] Pass CFLAGS from env to Makefile.prebuild and remove iOS hack --- .travis.yml | 2 +- Makefile.prebuild | 8 ++++---- Makefile.system | 2 +- c_check | 28 ++++++++-------------------- 4 files changed, 14 insertions(+), 26 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9e18412e8..0f20aef5c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -176,7 +176,7 @@ matrix: - <<: *test-macos osx_image: xcode10.1 env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk" + - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" diff --git a/Makefile.prebuild b/Makefile.prebuild index a366004a1..b00f13368 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -42,7 +42,7 @@ all: getarch_2nd ./getarch_2nd 1 >> $(TARGET_CONF) config.h : c_check f_check getarch - perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) + perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS) ifneq ($(ONLY_CBLAS), 1) perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) else @@ -59,13 +59,13 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - $(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) + $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c config.h dummy ifndef TARGET_CORE - $(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c + $(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c else - $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c + $(HOSTCC) -I. 
$(HOST_CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c endif dummy: diff --git a/Makefile.system b/Makefile.system index c0e45515f..cf9e9bafa 100644 --- a/Makefile.system +++ b/Makefile.system @@ -214,7 +214,7 @@ ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf diff --git a/c_check b/c_check index 555b2eccf..c7899c84f 100644 --- a/c_check +++ b/c_check @@ -18,11 +18,12 @@ $binary = $ENV{"BINARY"}; $makefile = shift(@ARGV); $config = shift(@ARGV); -$compiler_name = join(" ", @ARGV); +$compiler_name = shift(@ARGV); +$flags = join(" ", @ARGV); # First, we need to know the target OS and compiler name -$data = `$compiler_name -E ctest.c`; +$data = `$compiler_name $flags -E ctest.c`; if ($?) { printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; @@ -175,7 +176,7 @@ if ($defined == 0) { # Do again -$data = `$compiler_name -E ctest.c`; +$data = `$compiler_name $flags -E ctest.c`; if ($?) { printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; @@ -195,7 +196,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { print $tmpf "void main(void){ __asm__ volatile($code); }\n"; $args = "$msa_flags -o $tmpf.o $tmpf"; - my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? != 0) { $have_msa = 0; @@ -236,7 +237,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { if ($compiler eq "PGI") { $args = " -tp skylake -c -o $tmpf.o $tmpf"; } - my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? 
!= 0) { $no_avx512 = 1; @@ -247,7 +248,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } } -$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; +$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -263,19 +264,6 @@ if ($architecture ne $hostarch) { $cross = 1 if ($os ne $hostos); -# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS -# the initial autodetection will have been confused by the command-line arguments to clang -# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output -if (($os eq "Darwin") && ($cross_suffix ne "")) { - my $tmpnam = `xcrun --sdk iphoneos --find clang`; - $cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 ); -# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/"; - $cross =1; - $architecture = arm64; -} - - - $openmp = "" if $ENV{USE_OPENMP} != 1; $linker_L = ""; @@ -283,7 +271,7 @@ $linker_l = ""; $linker_a = ""; { - $link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; + $link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; $link =~ s/\-Y\sP\,/\-Y/g; From 0e7f43c898ea646b8bfec982657e77ac09e9f118 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Fri, 14 Feb 2020 10:35:51 +0100 Subject: [PATCH 912/935] Add missing USE_MIN in kernel/CMakeLists.txt. --- kernel/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index b3310e87e..35e0fff25 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -47,7 +47,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false ${float_type}) endif () if (DEFINED ${float_char}MINKERNEL) - GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "USE_MIN" "min_k" false "" "" false ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false ${float_type}) @@ -55,7 +55,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type}) endif () if (DEFINED I${float_char}MINKERNEL) - GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "USE_MIN" "i*min_k" false "" "" false ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type}) From 18bcc36a6996d117c156394b047ec1c0e298e202 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 13 Feb 2020 14:32:24 +0100 Subject: [PATCH 913/935] Fix implementation of iamax_sse.S as reported in #2116. 
The was a typo in iamax_sse.S where one of the comparison was cmpeqps instead of cmpeqss. That misdetected index for sequences where the minimum value was 0. --- kernel/x86_64/KERNEL | 4 +- kernel/x86_64/iamax_sse.S | 6 +-- utest/CMakeLists.txt | 1 + utest/Makefile | 2 +- utest/test_ismin.c | 89 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 8 deletions(-) create mode 100644 utest/test_ismin.c diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 92d121ab2..4874711bb 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S endif ifndef ISAMINKERNEL -ISAMINKERNEL = iamax.S +ISAMINKERNEL = iamax_sse.S endif ifndef IDAMINKERNEL @@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S endif ifndef ISMINKERNEL -ISMINKERNEL = iamax.S +ISMINKERNEL = iamax_sse.S endif ifndef IDMINKERNEL diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index d50c1699c..9c7af1fd7 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -36,10 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -/* This kernel was found to give wrong results when used for ISMIN/ISAMIN - with increment != 1, although it appears to be correct for corresponding - MAX operations. See issue 2116 */ - #define ASSEMBLER #include "common.h" @@ -863,7 +859,7 @@ #ifdef USE_ABS andps %xmm15, %xmm5 #endif - cmpeqps %xmm0, %xmm5 + cmpeqss %xmm0, %xmm5 movss 0 * SIZE(X), %xmm6 addq INCX, X diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 1e3051a8f..544646911 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -7,6 +7,7 @@ else () set(OpenBLAS_utest_src utest_main.c test_amax.c + test_ismin.c test_rotmg.c test_rot.c test_axpy.c diff --git a/utest/Makefile b/utest/Makefile index bd4bdf3ae..32bdcc6e1 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,7 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o +OBJS=utest_main.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_ismin.c b/utest/test_ismin.c new file mode 100644 index 000000000..f23d6b545 --- /dev/null +++ b/utest/test_ismin.c @@ -0,0 +1,89 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#define ELEMENTS 50 +#define INCREMENT 2 + +CTEST(ismin, positive_step_2){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0; + blasint index = BLASFUNC(ismin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +CTEST(ismin, negative_step_2){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -123456.0f; + blasint index = BLASFUNC(ismin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +CTEST(ismax, positive_step_2){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 123456.0f; + blasint index = BLASFUNC(ismax)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +CTEST(ismax, negative_step_2){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = 0; + blasint index = BLASFUNC(ismax)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} From aeea14ee4096860a75818547931c9dae43f965da Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 13 Feb 2020 14:42:45 +0100 Subject: [PATCH 914/935] Come up with LOAD_AND_COMPARE_TO_MXX macro in iamax_sse.S. 
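
The macro being introduced consolidates the load-and-compare sequence that the previous patch corrected from cmpeqps to cmpeqss. The distinction matters because the scalar tail loop loads each element with movss, which zeroes the upper three lanes of the destination register; when the running minimum is 0, a packed compare then matches those zeroed lanes and reports a hit for an element that is not equal at all. The effect is easy to reproduce with intrinsics (standalone sketch, not OpenBLAS code):

#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  float x = 5.0f;                     /* element being scanned           */
  __m128 minval = _mm_set1_ps(0.0f);  /* running minimum happens to be 0 */
  __m128 elem   = _mm_load_ss(&x);    /* movss: lanes 1..3 become 0.0    */

  int packed = _mm_movemask_ps(_mm_cmpeq_ps(minval, elem)); /* cmpeqps */
  int scalar = _mm_movemask_ps(_mm_cmpeq_ss(minval, elem)); /* cmpeqss */

  printf("cmpeqps mask: %d\n", packed); /* 14: lanes 1..3 match spuriously */
  printf("cmpeqss mask: %d\n", scalar); /*  0: only lane 0 is compared     */
  return 0;
}
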
--- kernel/x86_64/iamax_sse.S | 72 +++++++++------------------------------ 1 file changed, 17 insertions(+), 55 deletions(-) diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index 9c7af1fd7..4f62b9be2 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -55,6 +55,15 @@ #define MAXSS minss #endif +.macro LOAD_AND_COMPARE_TO_MXX REG + movss 0 * SIZE(X), \REG + addq INCX, X +#ifdef USE_ABS + andps %xmm15, \REG +#endif + cmpeqss %xmm0, \REG +.endm + #include "l1param.h" PROLOGUE @@ -826,61 +835,14 @@ ALIGN_4 .L93: - movss 0 * SIZE(X), %xmm1 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm1 -#endif - cmpeqss %xmm0, %xmm1 - - movss 0 * SIZE(X), %xmm2 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm2 -#endif - cmpeqss %xmm0, %xmm2 - - movss 0 * SIZE(X), %xmm3 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm3 -#endif - cmpeqss %xmm0, %xmm3 - - movss 0 * SIZE(X), %xmm4 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm4 -#endif - cmpeqss %xmm0, %xmm4 - - movss 0 * SIZE(X), %xmm5 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm5 -#endif - cmpeqss %xmm0, %xmm5 - - movss 0 * SIZE(X), %xmm6 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm6 -#endif - cmpeqss %xmm0, %xmm6 - - movss 0 * SIZE(X), %xmm7 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm7 -#endif - cmpeqss %xmm0, %xmm7 - - movss 0 * SIZE(X), %xmm8 - addq INCX, X -#ifdef USE_ABS - andps %xmm15, %xmm8 -#endif - cmpeqss %xmm0, %xmm8 + LOAD_AND_COMPARE_TO_MXX %xmm1 + LOAD_AND_COMPARE_TO_MXX %xmm2 + LOAD_AND_COMPARE_TO_MXX %xmm3 + LOAD_AND_COMPARE_TO_MXX %xmm4 + LOAD_AND_COMPARE_TO_MXX %xmm5 + LOAD_AND_COMPARE_TO_MXX %xmm6 + LOAD_AND_COMPARE_TO_MXX %xmm7 + LOAD_AND_COMPARE_TO_MXX %xmm8 orps %xmm2, %xmm1 orps %xmm4, %xmm3 From 2c242b4cefbd9e9d238fe50cb6ee90ef72f4e561 Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Mon, 17 Feb 2020 11:49:53 -0500 Subject: [PATCH 915/935] Add Github Action to build development branch nightly with Homebrew --- .github/workflows/nightly-Homebrew-build.yml | 63 ++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .github/workflows/nightly-Homebrew-build.yml diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml new file mode 100644 index 000000000..a5778e2eb --- /dev/null +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -0,0 +1,63 @@ +on: + push: + pull_request: + branches: + - develop + schedule: + - cron: 45 7 * * * + +name: Nightly-Homebrew-Build +jobs: + build-OpenBLAS-with-Homebrew: + runs-on: macos-latest + env: + HOMEBREW_DEVELOPER: "ON" + HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" + HOMEBREW_NO_ANALYTICS: "ON" + HOMEBREW_NO_AUTO_UPDATE: "ON" + HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON" + HOMEBREW_NO_INSTALL_CLEANUP: "ON" + + steps: + - name: Random delay for cron job + run: | + delay=$(( RANDOM % 600 )) + printf 'Delaying for %s seconds on event %s' ${delay} "${{ github.event_name }}" + sleep ${delay} + if: github.event_name == 'schedule' + + - uses: actions/checkout@v2 + + - name: Update Homebrew + if: github.event_name != 'pull_request' + run: brew update || true + + - name: Install prerequisites + run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas + + - name: Install and bottle OpenBLAS + run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas + + - name: Create bottle + run: brew bottle -v openblas + + - name: Upload bottle + uses: actions/upload-artifact@v1 + with: + name: openblas--HEAD.catalina.bottle.tar.gz + paht: ./*.bottle.* + + - 
name: Show linkage + run: brew linkage -v openblas + + - name: Test openblas + run: brew test --HEAD --verbose openblas + + - name: Audit openblas formula + run: | + brew audit --strict openblas + brew cat openblas + + - name: Post logs on failure + if: failure() + run: brew gist-logs --with-hostname -v openblas From 0b4480216421b7ea6f78b9c92bd2233e52ff79ab Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Mon, 17 Feb 2020 13:12:50 -0500 Subject: [PATCH 916/935] Test push & PRs only when workflow file changes Also, add comments to clarify what the test is testing --- .github/workflows/nightly-Homebrew-build.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index a5778e2eb..b55ae9daf 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -1,10 +1,20 @@ +# Only the "head" branch of the OpenBLAS package is tested + on: push: + paths: + - '**/nightlyHomebrew-build.yml' pull_request: branches: - develop + paths: + - '**/nightly-Homebrew-build.yml' schedule: - cron: 45 7 * * * +# This is 7:45 AM UTC daily, late at night in the USA + +# Since push and pull_request will still always be building and testing the `develop` branch, +# it only makes sense to test if this file has been changed name: Nightly-Homebrew-Build jobs: @@ -27,6 +37,7 @@ jobs: if: github.event_name == 'schedule' - uses: actions/checkout@v2 + # This isn't even needed, technically. Homebrew will get `develop` via git - name: Update Homebrew if: github.event_name != 'pull_request' @@ -37,6 +48,7 @@ jobs: - name: Install and bottle OpenBLAS run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas + # the HEAD flags tell Homebrew to build the develop branch fetch via git - name: Create bottle run: brew bottle -v openblas From 1a88c4ab263d1d4e73bfb5721fb2e2c803a6c1af Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Mon, 17 Feb 2020 13:32:33 -0500 Subject: [PATCH 917/935] Fix bottle upload problem & typo --- .github/workflows/nightly-Homebrew-build.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index b55ae9daf..f55e73d23 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -3,7 +3,7 @@ on: push: paths: - - '**/nightlyHomebrew-build.yml' + - '**/nightly-Homebrew-build.yml' pull_request: branches: - develop @@ -51,13 +51,16 @@ jobs: # the HEAD flags tell Homebrew to build the develop branch fetch via git - name: Create bottle - run: brew bottle -v openblas + run: | + brew bottle -v openblas + mkdir bottles + mv *.bottle.tar.gz bottles - name: Upload bottle uses: actions/upload-artifact@v1 with: name: openblas--HEAD.catalina.bottle.tar.gz - paht: ./*.bottle.* + path: bottles - name: Show linkage run: brew linkage -v openblas From 9f39f0a2c3df9d517932c9bd808755bbf5551383 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Feb 2020 19:55:39 +0100 Subject: [PATCH 918/935] Specify ismin/ismax assembly kernels for POWER8 directly to fix utest failure in new ismin test - Makefile.L1 defaults look wrong --- kernel/power/KERNEL.POWER8 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index ba9a99cd1..d3ae5def4 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -88,7 +88,10 @@ 
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c # #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c -# + +ISMINKERNEL = imin.S +ISMAXKERNEL = imax.S + ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") ifneq ($(GCCVERSIONGTEQ9),1) ISAMAXKERNEL = isamax_power8.S From 130c1741e5bba4bffb84e0057149b9c7b353bdee Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 18 Feb 2020 10:22:49 -0800 Subject: [PATCH 919/935] Fix install name on osx again --- Makefile.install | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index e01d866c9..2dc32c3d9 100644 --- a/Makefile.install +++ b/Makefile.install @@ -82,7 +82,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) endif ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" - @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" + @-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib From 76b2cec6ce9a80adde9ffe6ad31f5771e8dc26ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 18:08:20 +0100 Subject: [PATCH 920/935] Get endianness into Makefile variable --- Makefile.system | 1 + getarch.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/Makefile.system b/Makefile.system index cf9e9bafa..2073f56bb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1328,6 +1328,7 @@ export OSNAME export ARCH export CORE export LIBCORE +export __BYTE_ORDER__ export PGCPATH export CONFIG export CC diff --git a/getarch.c b/getarch.c index 1f590390a..53748897f 100644 --- a/getarch.c +++ b/getarch.c @@ -1298,6 +1298,13 @@ int main(int argc, char *argv[]){ #endif #endif +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); +#endif +#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 +printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); +#endif + #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 printf("MAKE += -j %d\n", MAKE_NB_JOBS); From 0b39cf95b022c3902cf78e4f3df2d11468f8a7fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 18:09:54 +0100 Subject: [PATCH 921/935] Fix endianness conditionals --- kernel/power/KERNEL.POWER8 | 10 +++++----- kernel/power/KERNEL.PPC440 | 8 ++++---- kernel/power/KERNEL.PPC970 | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index d3ae5def4..c7867012b 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -92,7 +92,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ISMINKERNEL = imin.S ISMAXKERNEL = imax.S -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) ISAMAXKERNEL = isamax_power8.S else @@ -104,7 +104,7 @@ endif # IDAMAXKERNEL = idamax.c # -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) ICAMAXKERNEL = icamax_power8.S else @@ -116,7 +116,7 @@ endif # IZAMAXKERNEL = izamax.c # -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) ISAMINKERNEL = isamin_power8.S else @@ -128,7 +128,7 @@ endif # IDAMINKERNEL = idamin.c # -ifneq 
($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) ICAMINKERNEL = icamin_power8.S else @@ -153,7 +153,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power8.S else diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 8d64d3fc1..677af5f21 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -15,7 +15,7 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c else @@ -25,7 +25,7 @@ endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CDOTKERNEL = zdot_ppc440.S ZDOTKERNEL = zdot_ppc440.S else @@ -62,7 +62,7 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S -ifneq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S else @@ -132,7 +132,7 @@ ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S -ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index cc1f215de..a99fb7d96 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,4 +1,4 @@ -ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMMKERNEL = gemm_kernel.S SGEMMINCOPY = SGEMMITCOPY = @@ -30,7 +30,7 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CGEMMKERNEL = zgemm_kernel.S CGEMMINCOPY = CGEMMITCOPY = @@ -72,7 +72,7 @@ ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S -ifeq ($(__BYTE_ORDER__),"__ORDER_BIG_ENDIAN__") +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) STRSMKERNEL_LN = trsm_kernel_LN.S STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S From e8d82c01d4b78612991569e7734509c7f8dc7936 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 18:49:13 +0100 Subject: [PATCH 922/935] Recognize Ampere EMAG8180 --- cpuid_arm64.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 9e019fe3e..5868af75c 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -41,6 +41,8 @@ #define CPU_THUNDERX2T99 8 //Hisilicon #define CPU_TSV110 9 +// Ampere +#define CPU_EMAG8180 10 static char *cpuname[] = { "UNKNOWN", @@ -52,7 +54,8 @@ static char *cpuname[] = { "FALKOR", "THUNDERX", "THUNDERX2T99", - "TSV110" + "TSV110", + "EMAG8180" }; static char *cpuname_lower[] = { @@ -65,7 +68,8 @@ static char *cpuname_lower[] = { "falkor", "thunderx", "thunderx2t99", - "tsv110" + "tsv110", + "emag8180" }; int get_feature(char *search) @@ -152,6 +156,9 @@ int detect(void) // HiSilicon else if (strstr(cpu_implementer, 
"0x48") && strstr(cpu_part, "0xd01")) return CPU_TSV110; + // Ampere + else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000")) + return CPU_EMAG8180; } p = (char *) NULL ; @@ -335,6 +342,18 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; + + case CPU_EMAG8180: + // Minimum parameters for ARMv8 (based on A53) + printf("#define EMAG8180\n"); + printf("#define L1_CODE_SIZE 32768\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + } get_cpucount(); } From 71e5669c3ea60ddc524ef16f5066d88b8ab37b04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 18:57:26 +0100 Subject: [PATCH 923/935] Add preliminary support for EMAG8180 ARMV8 processor --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index e6ab93aa5..055749dc1 100644 --- a/param.h +++ b/param.h @@ -2603,7 +2603,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(CORTEXA53) || defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From e57b11accae62804f0d7a7fe2d9f15a9e19932c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2020 19:00:28 +0100 Subject: [PATCH 924/935] Add preliminary support for EMAG8180 --- kernel/arm64/KERNEL.EMAG8180 | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 kernel/arm64/KERNEL.EMAG8180 diff --git a/kernel/arm64/KERNEL.EMAG8180 b/kernel/arm64/KERNEL.EMAG8180 new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.EMAG8180 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + From 4046985913ad8b62fc997da2cd60447af4bdf093 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 21 Feb 2020 11:55:52 +0100 Subject: [PATCH 925/935] Add proper defaults for IxMIN/IxMAX kernels the fallbacks from Makefile.L1 assume a combined source for absolute value and non-absolute (with ifdef USE_ABS) but here we have separate implementations --- kernel/power/KERNEL | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index c3c86b310..9070450f4 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -50,3 +50,26 @@ ifndef DSDOTKERNEL DSDOTKERNEL = ../generic/dot.c endif +ifndef ISMINKERNEL +ISMINKERNEL = imin.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = imin.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = imin.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = imax.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = imax.S +endif + +ifndef IQMAXKERNEL +IQMAXKERNEL = imax.S +endif From 07454bf4d55868a388733a4377fedbd8a56f15c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 21 Feb 2020 11:58:15 +0100 Subject: [PATCH 926/935] Add proper defaults for IxMIN/IxMAX kernels the fallbacks from Makefile.L1 assume a combined source for absolute value and non-absolute (with ifdef USE_ABS) but here we have separate implementations --- kernel/mips64/KERNEL | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 61da7445f..97ef3692c 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -167,3 
From 9e40c080f2c820d9e899182a8f8cb9b7d400bc55 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 23 Feb 2020 22:39:01 +0100
Subject: [PATCH 927/935] Apply fix from Reference-LAPACK PR390, NaN not propagating

---
 lapack-netlib/SRC/dcombssq.f | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lapack-netlib/SRC/dcombssq.f b/lapack-netlib/SRC/dcombssq.f
index 79f6d95c9..7a1ddd1af 100644
--- a/lapack-netlib/SRC/dcombssq.f
+++ b/lapack-netlib/SRC/dcombssq.f
@@ -80,6 +80,8 @@
       IF( V1( 1 ).GE.V2( 1 ) ) THEN
          IF( V1( 1 ).NE.ZERO ) THEN
             V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 )
+         ELSE
+            V1( 2 ) = V1( 2 ) + V2( 2 )
          END IF
       ELSE
          V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 )

From 87ac1ceb0baaaeec38fc108d537dd8ad5a3b679f Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 23 Feb 2020 22:40:40 +0100
Subject: [PATCH 928/935] Apply fix from Reference-LAPACK PR390, NaN not propagating

---
 lapack-netlib/SRC/scombssq.f | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lapack-netlib/SRC/scombssq.f b/lapack-netlib/SRC/scombssq.f
index 76bc0e320..cc51a324b 100644
--- a/lapack-netlib/SRC/scombssq.f
+++ b/lapack-netlib/SRC/scombssq.f
@@ -80,6 +80,8 @@
       IF( V1( 1 ).GE.V2( 1 ) ) THEN
          IF( V1( 1 ).NE.ZERO ) THEN
             V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 )
+         ELSE
+            V1( 2 ) = V1( 2 ) + V2( 2 )
          END IF
       ELSE
          V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 )
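
Note: xCOMBSSQ merges two scaled sums of squares, each held as a pair (scale, sumsq) representing scale**2 * sumsq. Before this fix, when both scales were zero the routine skipped the update entirely, so a NaN carried in V2's sumsq was silently discarded. A hedged C rendering of the combine step follows; the final scale update is part of the full routine but sits outside the quoted hunk, and the struct/field names are stand-ins.

/* C rendering of the xCOMBSSQ update, showing why the added ELSE
 * branch matters: with v1.scale == v2.scale == 0.0 the old code fell
 * through without touching v1.ssq, dropping any NaN held in v2.ssq. */
typedef struct { double scale, ssq; } ssq_t;  /* value = scale^2 * ssq */

static void combssq(ssq_t *v1, const ssq_t *v2) {
	if (v1->scale >= v2->scale) {
		if (v1->scale != 0.0) {
			double r = v2->scale / v1->scale;
			v1->ssq += r * r * v2->ssq;
		} else {
			v1->ssq += v2->ssq;  /* the fix: both scales zero, keep v2's ssq (and its NaN) */
		}
	} else {
		double r = v1->scale / v2->scale;
		v1->ssq = v2->ssq + r * r * v1->ssq;
		v1->scale = v2->scale;  /* scale update from the routine, outside the hunk */
	}
}
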
From c93ae92579c8b67908b4324f562332d76ebb6b75 Mon Sep 17 00:00:00 2001
From: wuanjun 00447568
Date: Mon, 24 Feb 2020 11:23:39 +0800
Subject: [PATCH 929/935] [OpenBlas]: benchmark/copy.c has time, x, y data
 loop problems

---
 benchmark/copy.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/benchmark/copy.c b/benchmark/copy.c
index ea5b38d68..d7f58c94f 100644
--- a/benchmark/copy.c
+++ b/benchmark/copy.c
@@ -129,7 +129,10 @@ int main(int argc, char *argv[]){
   int step = 1;
 
   struct timeval start, stop;
-  double time1,timeg;
+  double time1 = 0.0, timeg = 0.0;
+  long nanos = 0;
+  time_t seconds = 0;
+  struct timespec time_start = { 0, 0 }, time_end = { 0, 0 };
 
   argc--;argv++;
 
@@ -163,35 +166,32 @@ int main(int argc, char *argv[]){
 
       timeg=0;
 
      fprintf(stderr, " %6d : ", (int)m);
+     for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
+        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+     }
+     for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
+        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+     }
 
      for (l=0; l<loops; l++)
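
Note: the tail of this hunk did not survive extraction, but the surviving part shows the intent: the random initialization of x and y moves out of the timed region so every iteration copies the same data, and the gettimeofday pair gives way to clock_gettime with a timespec. A hedged sketch of that timing pattern follows; the choice of CLOCK_MONOTONIC and all of the scaffolding here are assumptions, not the patch's exact code.

/* Benchmark-timing pattern the patch moves toward: set up inputs once,
 * outside the timed loop, then time the hot loop with a clock that
 * wall-clock adjustments cannot skew. Generic sketch, not copy.c. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N     1000000
#define LOOPS 100

int main(void) {
	static double x[N], y[N];      /* static: keeps large arrays off the stack */
	struct timespec time_start, time_end;
	long i, l;

	/* Data setup stays outside the timed region. */
	for (i = 0; i < N; i++)
		x[i] = ((double)rand() / (double)RAND_MAX) - 0.5;

	clock_gettime(CLOCK_MONOTONIC, &time_start);
	for (l = 0; l < LOOPS; l++)
		for (i = 0; i < N; i++)
			y[i] = x[i];   /* stand-in for the copy kernel under test */
	clock_gettime(CLOCK_MONOTONIC, &time_end);

	/* Combine seconds and nanoseconds into one elapsed figure. */
	time_t seconds = time_end.tv_sec - time_start.tv_sec;
	long nanos = time_end.tv_nsec - time_start.tv_nsec;
	double elapsed = (double)seconds + (double)nanos * 1e-9;
	printf("%f sec per call\n", elapsed / LOOPS);
	return 0;
}
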
From: Martin Kroeker
Date: Mon, 24 Feb 2020 19:20:00 +0100
Subject: [PATCH 930/935] Add DYNAMIC_ARCH support for ARMV8 EMAG8180

---
 driver/others/dynamic_arm64.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 72f5fcca2..d0664ebcb 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -51,10 +51,11 @@ extern gotoblas_t gotoblas_FALKOR;
 extern gotoblas_t gotoblas_THUNDERX;
 extern gotoblas_t gotoblas_THUNDERX2T99;
 extern gotoblas_t gotoblas_TSV110;
+extern gotoblas_t gotoblas_EMAG8180;
 
 extern void openblas_warning(int verbose, const char * msg);
 
-#define NUM_CORETYPES    9
+#define NUM_CORETYPES   10
 
 /*
  * In case asm/hwcap.h is outdated on the build system, make sure
@@ -78,6 +79,7 @@ static char *corename[] = {
   "thunderx",
   "thunderx2t99",
   "tsv110",
+  "emag8180",
   "unknown"
 };
 
@@ -91,6 +93,7 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_THUNDERX)     return corename[ 6];
   if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
   if (gotoblas == &gotoblas_TSV110)       return corename[ 8];
+  if (gotoblas == &gotoblas_EMAG8180)     return corename[ 9];
   return corename[NUM_CORETYPES];
 }
 
@@ -119,6 +122,7 @@ static gotoblas_t *force_coretype(char *coretype) {
     case  6: return (&gotoblas_THUNDERX);
     case  7: return (&gotoblas_THUNDERX2T99);
     case  8: return (&gotoblas_TSV110);
+    case  9: return (&gotoblas_EMAG8180);
   }
   snprintf(message, 128, "Core not found: %s\n", coretype);
   openblas_warning(1, message);
@@ -189,6 +193,13 @@ static gotoblas_t *get_coretype(void) {
         return &gotoblas_TSV110;
       }
       break;
+    case 0x50: // Ampere
+      switch (part)
+      {
+        case 0x000: // Skylark/EMAG8180
+          return &gotoblas_EMAG818;
+      }
+      break;
     case 0x51: // Qualcomm
       switch (part)
       {

From 320e2648cd90e14c77e015e3cb912e37ce4c0b94 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 24 Feb 2020 19:23:46 +0100
Subject: [PATCH 931/935] Add EMAG8180 to DYNAMIC_CORE list for ARM64

---
 Makefile.system | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile.system b/Makefile.system
index cf9e9bafa..65027bd20 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -558,6 +558,7 @@ DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110
+DYNAMIC_CORE += EMAG8180
 endif
 
 ifeq ($(ARCH), zarch)

From 4c5fac5a2bad43ef6ffb333fc9a901e163588de1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 24 Feb 2020 20:15:04 +0100
Subject: [PATCH 932/935] Typo fix

---
 driver/others/dynamic_arm64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index d0664ebcb..9f42ce4c6 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -197,7 +197,7 @@ static gotoblas_t *get_coretype(void) {
       switch (part)
       {
         case 0x000: // Skylark/EMAG8180
-          return &gotoblas_EMAG818;
+          return &gotoblas_EMAG8180;
       }
       break;
     case 0x51: // Qualcomm

From 1ddf9f1067a7abb247ff659d4bda54ea62aeb758 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 24 Feb 2020 20:16:18 +0100
Subject: [PATCH 933/935] Add EMAG8180 to arm64 DYNAMIC_ARCH list for cmake

---
 cmake/arch.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 8280d6274..d31961c14 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -45,7 +45,7 @@ endif ()
 
 if (DYNAMIC_ARCH)
   if (ARM64)
-    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
+    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180)
   endif ()
 
   if (POWER)

From ca4f7dceff13fbdc2fb48d17641d4065d3a4edab Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 24 Feb 2020 20:23:18 +0100
Subject: [PATCH 934/935] Add parameters for EMAG8180 DYNAMIC_ARCH support
 with cmake

---
 cmake/prebuild.cmake | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index c6d109356..b74a0699b 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -332,6 +332,29 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
     set(ZGEMM_UNROLL_M 4)
     set(ZGEMM_UNROLL_N 4)
     set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "EMAG8180")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define ARMV8\n"
+      "#define L1_CODE_SIZE\t32768\n"
+      "#define L1_CODE_LINESIZE\t64\n"
+      "#define L1_CODE_ASSOCIATIVE\t4\n"
+      "#define L1_DATA_SIZE\t32768\n"
+      "#define L1_DATA_LINESIZE\t64\n"
+      "#define L1_DATA_ASSOCIATIVE\t4\n"
+      "#define L2_SIZE\t5262144\n"
+      "#define L2_LINESIZE\t64\n"
+      "#define L2_ASSOCIATIVE\t8\n"
+      "#define DTB_DEFAULT_ENTRIES\t64\n"
+      "#define DTB_SIZE\t4096\n")
+    set(SGEMM_UNROLL_M 16)
+    set(SGEMM_UNROLL_N 4)
+    set(DGEMM_UNROLL_M 8)
+    set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 8)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 4)
+    set(ZGEMM_UNROLL_N 4)
+    set(SYMV_P 16)
   elseif ("${TCORE}" STREQUAL "POWER6")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE 32768\n"

From f8ec538c82e5c5b9e363d4064360c6699a956f79 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Tue, 25 Feb 2020 14:30:00 +0100
Subject: [PATCH 935/935] Add Ampere EMAG8180

---
 getarch.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/getarch.c b/getarch.c
index 1f590390a..c1003c2d1 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1093,6 +1093,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif
 
+#ifdef FORCE_EMAG8180
+#define ARMV8
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "EMAG8180"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DEMAG8180 " \
+       "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
+       "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME   "emag8180"
+#define CORENAME  "EMAG8180"
+#endif
 
 #ifdef FORCE_ZARCH_GENERIC
 #define FORCE